Compare commits

..

6 Commits

Author SHA1 Message Date
Patrick Devine
cb576a6b23 fix ref 2024-08-26 19:59:33 -07:00
Patrick Devine
15b7ff3a89 more comments 2024-08-26 19:56:45 -07:00
Patrick Devine
3ad243466b comments 2024-08-26 19:54:06 -07:00
Patrick Devine
a13e583c49 cleanup whitespace 2024-08-26 18:09:21 -07:00
Patrick Devine
3c1994d0ee small change 2024-08-26 18:07:59 -07:00
Patrick Devine
1b2da3829d update the import docs 2024-08-26 18:04:46 -07:00
152 changed files with 26458 additions and 5050 deletions

View File

@@ -7,5 +7,3 @@ llm/llama.cpp
.env
.cache
test_data
llm/build
llama/build

View File

@@ -102,8 +102,7 @@ jobs:
with:
name: generate-windows-cpu
path: |
build/**/*
build/**/*.a
llm/build/**/bin/*
llm/build/**/*.a
dist/windows-amd64/**
@@ -177,7 +176,7 @@ jobs:
with:
name: generate-windows-rocm
path: |
build/**/*
llm/build/**/bin/*
dist/windows-amd64/**
- uses: actions/upload-artifact@v4
with:
@@ -266,7 +265,7 @@ jobs:
with:
name: generate-windows-cuda-${{ matrix.cuda.version }}
path: |
build/**/*
llm/build/**/bin/*
dist/windows-amd64/**
- uses: actions/upload-artifact@v4
with:
@@ -274,134 +273,7 @@ jobs:
path: dist/deps/*
# windows arm64 generate, go build, and zip file (no installer)
# Output of this build is aggregated into the final x86 build
# for a unified windows installer
windows-arm64:
runs-on: windows-arm64
environment: release
env:
KEY_CONTAINER: ${{ vars.KEY_CONTAINER }}
steps:
# The current Windows arm64 beta image has effectively zero dev tools installed...
- name: Install git and gzip
run: |
Set-ExecutionPolicy Bypass -Scope Process -Force
[System.Net.ServicePointManager]::SecurityProtocol = [System.Net.ServicePointManager]::SecurityProtocol -bor 3072
iex ((New-Object System.Net.WebClient).DownloadString('https://community.chocolatey.org/install.ps1'))
choco install -y --no-progress git gzip
echo "C:\Program Files\Git\cmd" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "C:\ProgramData\chocolatey\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
- name: Install Visual Studio 2022
run: |
$components = @(
"Microsoft.VisualStudio.Component.CoreEditor",
"Microsoft.VisualStudio.Workload.CoreEditor",
"Microsoft.VisualStudio.Component.Roslyn.Compiler",
"Microsoft.Component.MSBuild",
"Microsoft.VisualStudio.Component.TextTemplating",
"Microsoft.VisualStudio.Component.Debugger.JustInTime",
"Microsoft.VisualStudio.Component.VC.CoreIde",
"Microsoft.VisualStudio.Component.VC.Tools.x86.x64",
"Microsoft.VisualStudio.Component.Windows11SDK.22621",
"Microsoft.VisualStudio.Component.VC.Tools.ARM64EC",
"Microsoft.VisualStudio.Component.VC.Tools.ARM64",
"Microsoft.VisualStudio.Component.VC.ATL",
"Microsoft.VisualStudio.Component.VC.ATL.ARM64",
"Microsoft.VisualStudio.Component.Graphics",
"Microsoft.VisualStudio.Component.VC.Redist.14.Latest",
"Microsoft.VisualStudio.ComponentGroup.NativeDesktop.Core",
"Microsoft.VisualStudio.Component.Windows11Sdk.WindowsPerformanceToolkit",
"Microsoft.VisualStudio.Component.CppBuildInsights",
"Microsoft.VisualStudio.Component.VC.DiagnosticTools",
"Microsoft.VisualStudio.ComponentGroup.WebToolsExtensions.CMake",
"Microsoft.VisualStudio.Component.VC.CMake.Project",
"Microsoft.VisualStudio.Component.VC.ASAN",
"Microsoft.VisualStudio.Component.Vcpkg",
"Microsoft.VisualStudio.Workload.NativeDesktop"
)
$config = @{
"version" = "1.0"
"components" = $components
"extensions" = @()
}
$configPath = "${env:RUNNER_TEMP}\vsconfig"
$config | ConvertTo-Json | Out-File -FilePath $configPath
$bootstrapperFilePath = "${env:RUNNER_TEMP}\vs_community.exe"
write-host "Downloading Visual Studio 2022"
Invoke-WebRequest -Uri "https://aka.ms/vs/17/release/vs_community.exe" -outfile $bootstrapperFilePath
$bootstrapperArgumentList = ('/c', $bootstrapperFilePath, '--config', $configPath, '--quiet', '--wait' )
write-host "Installing Visual Studio 2022"
$process = Start-Process -FilePath cmd.exe -ArgumentList $bootstrapperArgumentList -Wait -PassThru
$exitCode = $process.ExitCode
write-host $exitCode
# pacman in mingw/msys2 is ~broken on windows arm right now - hangs consistently during attempts to install
# so we'll use this alternative GCC binary
- name: Install llvm-mingw GCC
run: |
$gcc_url="https://github.com/mstorsjo/llvm-mingw/releases/download/20240619/llvm-mingw-20240619-ucrt-aarch64.zip"
write-host "Downloading llvm-mingw"
Invoke-WebRequest -Uri "${gcc_url}" -OutFile "${env:RUNNER_TEMP}\gcc.zip"
write-host "Unpacking llvm-mingw"
expand-archive -path "${env:RUNNER_TEMP}\gcc.zip" -destinationpath "c:\"
mv c:\llvm-mingw-* c:\llvm-mingw
echo "c:\llvm-mingw\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
- name: Verify GCC
run: |
echo $env:PATH
gcc --version
- uses: actions/checkout@v4
- name: Set Version
run: |
$ver=${env:GITHUB_REF_NAME}.trim("v")
echo VERSION=$ver | Out-File -FilePath ${env:GITHUB_ENV} -Encoding utf8 -Append
- uses: 'google-github-actions/auth@v2'
with:
project_id: 'ollama'
credentials_json: '${{ secrets.GOOGLE_SIGNING_CREDENTIALS }}'
- run: echo "${{ vars.OLLAMA_CERT }}" | Out-File -FilePath ollama_inc.crt -Encoding utf8
- name: install Windows SDK 8.1 to get signtool
run: |
$ErrorActionPreference = "Stop"
write-host "downloading SDK"
Invoke-WebRequest -Uri "https://go.microsoft.com/fwlink/p/?LinkId=323507" -OutFile "${env:RUNNER_TEMP}\sdksetup.exe"
Start-Process "${env:RUNNER_TEMP}\sdksetup.exe" -ArgumentList @("/q") -NoNewWindow -Wait
write-host "Win SDK 8.1 installed"
gci -path 'C:\Program Files (x86)\Windows Kits\' -r -fi 'signtool.exe'
- name: install signing plugin
run: |
$ErrorActionPreference = "Stop"
write-host "downloading plugin"
Invoke-WebRequest -Uri "https://github.com/GoogleCloudPlatform/kms-integrations/releases/download/cng-v1.0/kmscng-1.0-windows-amd64.zip" -OutFile "${env:RUNNER_TEMP}\plugin.zip"
Expand-Archive -Path "${env:RUNNER_TEMP}\plugin.zip" -DestinationPath ${env:RUNNER_TEMP}\plugin\
write-host "Installing plugin"
& "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet
write-host "plugin installed"
- uses: actions/setup-go@v5
with:
go-version-file: go.mod
cache: true
- run: go get ./...
- run: |
$gopath=(get-command go).source | split-path -parent
$gccpath=(get-command gcc).source | split-path -parent
& "C:\Program Files\Microsoft Visual Studio\2022\Community\Common7\Tools\Launch-VsDevShell.ps1"
cd $env:GITHUB_WORKSPACE
$env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
$env:PATH="$gopath;$gccpath;$env:PATH;C:\Program Files\Microsoft Visual Studio\2022\Community\Common7\IDE\CommonExtensions\Microsoft\CMake\CMake\bin"
echo $env:PATH
$env:ARCH="arm64"
.\scripts\build_windows.ps1 buildOllama buildApp gatherDependencies distZip
name: 'Windows Build'
- uses: actions/upload-artifact@v4
with:
name: windows-arm64
path: |
dist/windows-arm64/**
dist/windows-arm64-app.exe
dist/ollama-windows-arm64.zip
# Import the prior generation steps plus the full arm64 build, and build the final windows assets
# Import the prior generation steps and build the final windows assets
build-windows:
environment: release
runs-on: windows
@@ -409,7 +281,6 @@ jobs:
- generate-windows-cuda
- generate-windows-rocm
- generate-windows-cpu
- windows-arm64
env:
KEY_CONTAINER: ${{ vars.KEY_CONTAINER }}
steps:
@@ -467,11 +338,7 @@ jobs:
- uses: actions/download-artifact@v4
with:
name: generate-windows-rocm
- uses: actions/download-artifact@v4
with:
name: windows-arm64
path: dist
- run: dir build
- run: dir llm/build
- run: |
$gopath=(get-command go).source | split-path -parent
& "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
@@ -492,7 +359,9 @@ jobs:
environment: release
runs-on: linux
env:
PLATFORM: linux/amd64
OLLAMA_SKIP_MANIFEST_CREATE: '1'
BUILD_ARCH: amd64
PUSH: '1'
steps:
- uses: actions/checkout@v4
with:
@@ -500,8 +369,14 @@ jobs:
- name: Set Version
shell: bash
run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
- name: Login to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ vars.DOCKER_USER }}
password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
- run: |
./scripts/build_linux.sh
./scripts/build_docker.sh
- uses: actions/upload-artifact@v4
with:
name: dist-linux-amd64
@@ -515,7 +390,9 @@ jobs:
environment: release
runs-on: linux-arm64
env:
PLATFORM: linux/arm64
OLLAMA_SKIP_MANIFEST_CREATE: '1'
BUILD_ARCH: arm64
PUSH: '1'
steps:
- uses: actions/checkout@v4
with:
@@ -544,8 +421,14 @@ jobs:
sudo usermod -aG docker $USER
sudo apt-get install acl
sudo setfacl --modify user:$USER:rw /var/run/docker.sock
- name: Login to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ vars.DOCKER_USER }}
password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
- run: |
./scripts/build_linux.sh
./scripts/build_docker.sh
- uses: actions/upload-artifact@v4
with:
name: dist-linux-arm64
@@ -553,178 +436,6 @@ jobs:
dist/*linux*
!dist/*-cov
# Container image build
build-container-image:
environment: release
strategy:
matrix:
runner:
- linux
- linux-arm64
runs-on: ${{ matrix.runner }}
env:
FINAL_IMAGE_REPO: ollama/ollama
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- name: 'Install Docker'
if: ${{ startsWith(matrix.runner, 'linux-arm64') }}
run: |
sudo apt-get update
sudo apt-get install -y ca-certificates curl
sudo install -m 0755 -d /etc/apt/keyrings
sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
sudo chmod a+r /etc/apt/keyrings/docker.asc
echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
$(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
sudo apt-get update
sudo apt-get install -y docker-ce docker-ce-cli containerd.io
sudo usermod -aG docker $USER
sudo apt-get install acl
sudo setfacl --modify user:$USER:rw /var/run/docker.sock
- name: Docker meta
id: meta
uses: docker/metadata-action@v5
with:
images: ${{ env.FINAL_IMAGE_REPO }}
flavor: |
latest=false
tags: |
type=ref,enable=true,priority=600,prefix=0.0.0-pr,suffix=,event=pr
type=semver,pattern={{version}}
- name: Set Version
shell: bash
run: |
machine=$(uname -m)
case ${machine} in
x86_64) echo ARCH=amd64; echo PLATFORM_PAIR=linux-amd64 ;;
aarch64) echo ARCH=arm64; echo PLATFORM_PAIR=linux-arm64 ;;
esac >>$GITHUB_ENV
echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${{ env.DOCKER_METADATA_OUTPUT_VERSION }}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_ENV
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Login to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ vars.DOCKER_USER }}
password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
- name: Build and push by digest
id: build
uses: docker/build-push-action@v6
with:
context: "."
platforms: linux/${{ env.ARCH }}
build-args: |
GOFLAGS
outputs: type=image,name=${{ env.FINAL_IMAGE_REPO }},push-by-digest=true,name-canonical=true,push=true
- name: Export digest
run: |
mkdir -p /tmp/digests
digest="${{ steps.build.outputs.digest }}"
touch "/tmp/digests/${digest#sha256:}"
- name: Upload digest
uses: actions/upload-artifact@v4
with:
name: digests-${{ env.PLATFORM_PAIR }}
path: /tmp/digests/*
if-no-files-found: error
retention-days: 1
merge:
environment: release
runs-on: linux
needs:
- build-container-image
env:
FINAL_IMAGE_REPO: ollama/ollama
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- name: Download digests
uses: actions/download-artifact@v4
with:
path: /tmp/digests
pattern: digests-*
merge-multiple: true
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Docker meta
id: meta
uses: docker/metadata-action@v5
with:
images: ${{ env.FINAL_IMAGE_REPO }}
flavor: |
latest=false
tags: |
type=ref,enable=true,priority=600,prefix=0.0.0-pr,suffix=,event=pr
type=semver,pattern={{version}}
- name: Set Version
shell: bash
run: |
machine=$(uname -m)
case ${machine} in
x86_64) echo ARCH=amd64; echo PLATFORM_PAIR=linux-amd64 ;;
aarch64) echo ARCH=arm64; echo PLATFORM_PAIR=linux-arm64 ;;
esac >>$GITHUB_ENV
echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${{ env.DOCKER_METADATA_OUTPUT_VERSION }}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_ENV
- name: Login to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ vars.DOCKER_USER }}
password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
- name: Create manifest list and push
working-directory: /tmp/digests
run: |
docker buildx imagetools create $(jq -cr '.tags | map("-t " + .) | join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \
$(printf '${{ env.FINAL_IMAGE_REPO }}@sha256:%s ' *)
- name: Inspect image
run: |
docker buildx imagetools inspect ${{ env.FINAL_IMAGE_REPO }}:${{ steps.meta.outputs.version }}
build-container-image-rocm:
environment: release
runs-on: linux
env:
FINAL_IMAGE_REPO: ollama/ollama
ARCH: amd64
PLATFORM_PAIR: linux-amd64
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- name: Docker meta
id: meta
uses: docker/metadata-action@v5
with:
images: ${{ env.FINAL_IMAGE_REPO }}
flavor: |
latest=false
tags: |
type=ref,enable=true,priority=600,prefix=0.0.0-pr,suffix=,event=pr
type=semver,pattern={{version}}
- name: Set Version
shell: bash
run: |
echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${{ env.DOCKER_METADATA_OUTPUT_VERSION }}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_ENV
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Login to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ vars.DOCKER_USER }}
password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
- name: Build and push by digest
id: build
uses: docker/build-push-action@v6
with:
context: "."
target: runtime-rocm
build-args: |
GOFLAGS
tags: ${{ env.FINAL_IMAGE_REPO }}:${{ env.DOCKER_METADATA_OUTPUT_VERSION}}-rocm
push: true
# Aggregate all the assets and ship a release
release:
needs:
@@ -737,6 +448,8 @@ jobs:
permissions:
contents: write
env:
OLLAMA_SKIP_IMAGE_BUILD: '1'
PUSH: '1'
GH_TOKEN: ${{ github.token }}
steps:
- uses: actions/checkout@v4
@@ -745,6 +458,12 @@ jobs:
run: |
echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
echo "RELEASE_VERSION=$(echo ${GITHUB_REF_NAME} | cut -f1 -d-)" >> $GITHUB_ENV
- name: Login to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ vars.DOCKER_USER }}
password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
- run: ./scripts/build_docker.sh
- name: Retrieve built artifact
uses: actions/download-artifact@v4
with:
@@ -755,6 +474,8 @@ jobs:
ls -lh dist/
(cd dist; find . -type f | xargs sha256sum > ../sha256sum.txt)
mv sha256sum.txt dist/
mv dist/linux-???64 .
mv dist/linux-amd64-rocm .
cat dist/sha256sum.txt
- name: Create or update Release
run: |

View File

@@ -81,6 +81,12 @@ jobs:
if: ${{ ! startsWith(matrix.os, 'windows-') }}
name: 'Unix Go Generate'
- run: go build .
- uses: actions/upload-artifact@v4
with:
name: ${{ matrix.os }}-${{ matrix.arch }}-libraries
path: |
llm/build/**/bin/*
llm/build/**/*.a
generate-cuda:
needs: [changes]
if: ${{ needs.changes.outputs.GENERATE_CUDA == 'True' }}
@@ -108,6 +114,12 @@ jobs:
go generate -x ./...
env:
OLLAMA_SKIP_CPU_GENERATE: '1'
- uses: actions/upload-artifact@v4
with:
name: cuda-${{ matrix.cuda-version }}-libraries
path: |
llm/build/**/bin/*
dist/windows-amd64/**
generate-rocm:
needs: [changes]
if: ${{ needs.changes.outputs.GENERATE_ROCM == 'True' }}
@@ -135,6 +147,12 @@ jobs:
go generate -x ./...
env:
OLLAMA_SKIP_CPU_GENERATE: '1'
- uses: actions/upload-artifact@v4
with:
name: rocm-${{ matrix.rocm-version }}-libraries
path: |
llm/build/**/bin/*
dist/windows-amd64/**
# ROCm generation step
generate-windows-rocm:
@@ -171,6 +189,7 @@ jobs:
name: go generate
env:
OLLAMA_SKIP_CPU_GENERATE: '1'
# TODO - do we need any artifacts?
# CUDA generation step
generate-windows-cuda:
@@ -212,6 +231,7 @@ jobs:
go generate -x ./...
env:
OLLAMA_SKIP_CPU_GENERATE: '1'
# TODO - do we need any artifacts?
lint:
strategy:
@@ -243,6 +263,14 @@ jobs:
arm64) echo ARCH=arm64 ;;
esac >>$GITHUB_ENV
shell: bash
- run: |
mkdir -p llm/build/linux/$ARCH/stub/bin
touch llm/build/linux/$ARCH/stub/bin/ollama_llama_server
if: ${{ startsWith(matrix.os, 'ubuntu-') }}
- run: |
mkdir -p llm/build/darwin/$ARCH/stub/bin
touch llm/build/darwin/$ARCH/stub/bin/ollama_llama_server
if: ${{ startsWith(matrix.os, 'macos-') }}
- uses: golangci/golangci-lint-action@v6
with:
args: --timeout 8m0s -v
@@ -273,10 +301,23 @@ jobs:
cache: true
- run: |
case ${{ matrix.arch }} in
amd64) echo ARCH=amd64 ;;
amd64) echo ARCH=x86_64 ;;
arm64) echo ARCH=arm64 ;;
esac >>$GITHUB_ENV
shell: bash
- run: |
mkdir -p llm/build/linux/$ARCH/stub/bin
touch llm/build/linux/$ARCH/stub/bin/ollama_llama_server
if: ${{ startsWith(matrix.os, 'ubuntu-') }}
- run: |
mkdir -p llm/build/darwin/$ARCH/stub/bin
touch llm/build/darwin/$ARCH/stub/bin/ollama_llama_server
if: ${{ startsWith(matrix.os, 'macos-') }}
shell: bash
- run: go generate ./...
- run: go build
- run: go test -v ./...
- uses: actions/upload-artifact@v4
with:
name: ${{ matrix.os }}-binaries
path: ollama

3
.gitignore vendored
View File

@@ -12,7 +12,4 @@ ggml-metal.metal
test_data
*.crt
llm/build
build/*/*/*
!build/**/placeholder
llama/build
__debug_bin*

View File

@@ -32,10 +32,6 @@ linters:
linters-settings:
gci:
sections: [standard, default, localmodule]
staticcheck:
checks:
- all
- -SA1019 # omit Deprecated check
severity:
default-severity: error
rules:

View File

@@ -18,7 +18,7 @@ See the [development documentation](./docs/development.md) for instructions on h
* New features: new features (e.g. API fields, environment variables) add surface area to Ollama and make it harder to maintain in the long run as they cannot be removed without potentially breaking users in the future.
* Refactoring: large code improvements are important, but can be harder or take longer to review and merge.
* Documentation: small updates to fill in or correct missing documentation is helpful, however large documentation additions can be hard to maintain over time.
* Documentation: small updates to fill in or dorrect missing documentation is helpful, however large documentation additions can be hard to maintain over time.
### Issues that may not be accepted

View File

@@ -16,12 +16,12 @@ FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION_11-devel-centos7 AS cuda-1
ARG CMAKE_VERSION
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
ENV PATH=/opt/rh/devtoolset-10/root/usr/bin:$PATH
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
COPY --from=llm-code / /go/src/github.com/ollama/ollama/
WORKDIR /go/src/github.com/ollama/ollama/llm/generate
ARG CGO_CFLAGS
ARG CUDA_V11_ARCHITECTURES
ENV GOARCH=amd64
ENV GOARCH amd64
RUN --mount=type=cache,target=/root/.ccache \
OLLAMA_SKIP_STATIC_GENERATE=1 \
OLLAMA_SKIP_CPU_GENERATE=1 \
@@ -33,12 +33,12 @@ FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION_12-devel-centos7 AS cuda-1
ARG CMAKE_VERSION
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
ENV PATH=/opt/rh/devtoolset-10/root/usr/bin:$PATH
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
COPY --from=llm-code / /go/src/github.com/ollama/ollama/
WORKDIR /go/src/github.com/ollama/ollama/llm/generate
ARG CGO_CFLAGS
ARG CUDA_V12_ARCHITECTURES
ENV GOARCH=amd64
ENV GOARCH amd64
RUN --mount=type=cache,target=/root/.ccache \
OLLAMA_SKIP_STATIC_GENERATE=1 \
OLLAMA_SKIP_CPU_GENERATE=1 \
@@ -47,32 +47,32 @@ RUN --mount=type=cache,target=/root/.ccache \
OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on" \
bash gen_linux.sh
FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_11-devel-rockylinux8 AS cuda-11-build-runner-arm64
FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_11-devel-rockylinux8 AS cuda-11-build-server-arm64
ARG CMAKE_VERSION
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
ENV PATH=/opt/rh/gcc-toolset-10/root/usr/bin:$PATH
ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH
COPY --from=llm-code / /go/src/github.com/ollama/ollama/
WORKDIR /go/src/github.com/ollama/ollama/llm/generate
ARG CGO_CFLAGS
ARG CUDA_V11_ARCHITECTURES
ENV GOARCH=arm64
ENV GOARCH arm64
RUN OLLAMA_SKIP_STATIC_GENERATE=1 \
OLLAMA_SKIP_CPU_GENERATE=1 \
CMAKE_CUDA_ARCHITECTURES="${CUDA_V11_ARCHITECTURES}" \
CUDA_VARIANT="_v11" \
bash gen_linux.sh
FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_12-devel-rockylinux8 AS cuda-12-build-runner-arm64
FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_12-devel-rockylinux8 AS cuda-12-build-server-arm64
ARG CMAKE_VERSION
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
ENV PATH=/opt/rh/gcc-toolset-10/root/usr/bin:$PATH
ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH
COPY --from=llm-code / /go/src/github.com/ollama/ollama/
WORKDIR /go/src/github.com/ollama/ollama/llm/generate
ARG CGO_CFLAGS
ARG CUDA_V12_ARCHITECTURES
ENV GOARCH=arm64
ENV GOARCH arm64
RUN --mount=type=cache,target=/root/.ccache \
OLLAMA_SKIP_STATIC_GENERATE=1 \
OLLAMA_SKIP_CPU_GENERATE=1 \
@@ -86,13 +86,13 @@ FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS rocm-b
ARG CMAKE_VERSION
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
ENV PATH=/opt/rh/devtoolset-10/root/usr/bin:$PATH
ENV LIBRARY_PATH=/opt/amdgpu/lib64
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
ENV LIBRARY_PATH /opt/amdgpu/lib64
COPY --from=llm-code / /go/src/github.com/ollama/ollama/
WORKDIR /go/src/github.com/ollama/ollama/llm/generate
ARG CGO_CFLAGS
ARG AMDGPU_TARGETS
ENV GOARCH=amd64
ENV GOARCH amd64
RUN --mount=type=cache,target=/root/.ccache \
OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh
RUN mkdir -p ../../dist/linux-amd64-rocm/lib/ollama && \
@@ -103,11 +103,11 @@ ARG CMAKE_VERSION
ARG GOLANG_VERSION
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
ENV PATH=/opt/rh/devtoolset-10/root/usr/bin:$PATH
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
COPY --from=llm-code / /go/src/github.com/ollama/ollama/
ARG OLLAMA_CUSTOM_CPU_DEFS
ARG CGO_CFLAGS
ENV GOARCH=amd64
ENV GOARCH amd64
WORKDIR /go/src/github.com/ollama/ollama/llm/generate
FROM --platform=linux/amd64 cpu-builder-amd64 AS static-build-amd64
@@ -128,11 +128,11 @@ ARG CMAKE_VERSION
ARG GOLANG_VERSION
COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
ENV PATH=/opt/rh/gcc-toolset-10/root/usr/bin:$PATH
ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH
COPY --from=llm-code / /go/src/github.com/ollama/ollama/
ARG OLLAMA_CUSTOM_CPU_DEFS
ARG CGO_CFLAGS
ENV GOARCH=arm64
ENV GOARCH arm64
WORKDIR /go/src/github.com/ollama/ollama/llm/generate
FROM --platform=linux/arm64 cpu-builder-arm64 AS static-build-arm64
@@ -143,112 +143,71 @@ RUN --mount=type=cache,target=/root/.ccache \
OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" bash gen_linux.sh
# Intermediate stages used for ./scripts/build_linux.sh
# Intermediate stage used for ./scripts/build_linux.sh
FROM --platform=linux/amd64 cpu-build-amd64 AS build-amd64
ENV CGO_ENABLED=1
ENV CGO_ENABLED 1
WORKDIR /go/src/github.com/ollama/ollama
COPY . .
COPY --from=static-build-amd64 /go/src/github.com/ollama/ollama/llm/build/ llm/build/
COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/build/ build/
COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/build/ build/
COPY --from=static-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/build/ build/
COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/build/ build/
COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/build/ build/
COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
ARG GOFLAGS
ARG CGO_CFLAGS
RUN --mount=type=cache,target=/root/.ccache \
go build -trimpath -o dist/linux-amd64/bin/ollama .
RUN cd dist/linux-$GOARCH && \
tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz
RUN cd dist/linux-$GOARCH-rocm && \
tar -cf - . | pigz --best > ../ollama-linux-$GOARCH-rocm.tgz
# Intermediate stage used for ./scripts/build_linux.sh
FROM --platform=linux/arm64 cpu-build-arm64 AS build-arm64
ENV CGO_ENABLED=1
ENV CGO_ENABLED 1
ARG GOLANG_VERSION
WORKDIR /go/src/github.com/ollama/ollama
COPY . .
COPY --from=static-build-arm64 /go/src/github.com/ollama/ollama/llm/build/ llm/build/
COPY --from=cuda-11-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
COPY --from=cuda-11-build-runner-arm64 /go/src/github.com/ollama/ollama/build/ build/
COPY --from=cuda-12-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
COPY --from=cuda-12-build-runner-arm64 /go/src/github.com/ollama/ollama/build/ build/
ARG GOFLAGS
ARG CGO_CFLAGS
RUN --mount=type=cache,target=/root/.ccache \
go build -trimpath -o dist/linux-arm64/bin/ollama .
RUN cd dist/linux-$GOARCH && \
tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz
FROM --platform=linux/amd64 scratch AS dist-amd64
COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz /
FROM --platform=linux/arm64 scratch AS dist-arm64
COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz /
FROM dist-$TARGETARCH as dist
# Optimized container images do not carry nested payloads
FROM --platform=linux/amd64 static-build-amd64 AS container-build-amd64
WORKDIR /go/src/github.com/ollama/ollama
COPY . .
ARG GOFLAGS
ARG CGO_CFLAGS
RUN --mount=type=cache,target=/root/.ccache \
go build -trimpath -o dist/linux-amd64/bin/ollama .
FROM --platform=linux/arm64 static-build-arm64 AS container-build-arm64
WORKDIR /go/src/github.com/ollama/ollama
COPY . .
COPY --from=static-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
COPY --from=cuda-12-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
COPY --from=cuda-12-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
ARG GOFLAGS
ARG CGO_CFLAGS
RUN --mount=type=cache,target=/root/.ccache \
go build -trimpath -o dist/linux-arm64/bin/ollama .
FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-amd64
RUN apt-get update && \
apt-get install -y ca-certificates && \
apt-get clean && rm -rf /var/lib/apt/lists/*
COPY --from=container-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
COPY --from=cpu-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
# Strip out ROCm dependencies to keep the primary image lean
FROM --platform=linux/amd64 ubuntu:22.04 as amd64-libs-without-rocm
COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /scratch/
RUN cd /scratch/ollama/ && rm -rf rocblas libamd* libdrm* libroc* libhip* libhsa*
FROM --platform=linux/arm64 ubuntu:22.04 AS runtime-arm64
RUN apt-get update && \
apt-get install -y ca-certificates && \
apt-get clean && rm -rf /var/lib/apt/lists/*
COPY --from=container-build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/bin/ /bin/
COPY --from=cpu-build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
COPY --from=cuda-11-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
COPY --from=cuda-12-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
# Runtime stages
FROM --platform=linux/amd64 ubuntu:22.04 as runtime-amd64
COPY --from=amd64-libs-without-rocm /scratch/ /lib/
RUN apt-get update && apt-get install -y ca-certificates
COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
# ROCm libraries are larger, so we keep them distinct from the CPU/CUDA image
FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-rocm
# Frontload the rocm libraries which are large, and rarely change to increase chance of a common layer
# across releases
COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64-rocm/lib/ /lib/
RUN apt-get update && \
apt-get install -y ca-certificates && \
apt-get clean && rm -rf /var/lib/apt/lists/*
COPY --from=container-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
COPY --from=cpu-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
FROM --platform=linux/arm64 ubuntu:22.04 as runtime-arm64
COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
RUN apt-get update && apt-get install -y ca-certificates
COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/bin/ /bin/
# Radeon images are much larger so we keep it distinct from the CPU/CUDA image
FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete as runtime-rocm
RUN update-pciids
COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
RUN ln -s /opt/rocm/lib /lib/ollama
EXPOSE 11434
ENV OLLAMA_HOST=0.0.0.0
ENV OLLAMA_HOST 0.0.0.0
ENTRYPOINT ["/bin/ollama"]
CMD ["serve"]
FROM runtime-$TARGETARCH
EXPOSE 11434
ENV OLLAMA_HOST=0.0.0.0
ENV OLLAMA_HOST 0.0.0.0
ENV PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility

View File

@@ -35,10 +35,10 @@ The official [Ollama Docker image](https://hub.docker.com/r/ollama/ollama) `olla
## Quickstart
To run and chat with [Llama 3.2](https://ollama.com/library/llama3.2):
To run and chat with [Llama 3.1](https://ollama.com/library/llama3.1):
```
ollama run llama3.2
ollama run llama3.1
```
## Model library
@@ -49,8 +49,6 @@ Here are some example models that can be downloaded:
| Model | Parameters | Size | Download |
| ------------------ | ---------- | ----- | ------------------------------ |
| Llama 3.2 | 3B | 2.0GB | `ollama run llama3.2` |
| Llama 3.2 | 1B | 1.3GB | `ollama run llama3.2:1b` |
| Llama 3.1 | 8B | 4.7GB | `ollama run llama3.1` |
| Llama 3.1 | 70B | 40GB | `ollama run llama3.1:70b` |
| Llama 3.1 | 405B | 231GB | `ollama run llama3.1:405b` |
@@ -101,16 +99,16 @@ See the [guide](docs/import.md) on importing models for more information.
### Customize a prompt
Models from the Ollama library can be customized with a prompt. For example, to customize the `llama3.2` model:
Models from the Ollama library can be customized with a prompt. For example, to customize the `llama3.1` model:
```
ollama pull llama3.2
ollama pull llama3.1
```
Create a `Modelfile`:
```
FROM llama3.2
FROM llama3.1
# set the temperature to 1 [higher is more creative, lower is more coherent]
PARAMETER temperature 1
@@ -145,7 +143,7 @@ ollama create mymodel -f ./Modelfile
### Pull a model
```
ollama pull llama3.2
ollama pull llama3.1
```
> This command can also be used to update a local model. Only the diff will be pulled.
@@ -153,13 +151,13 @@ ollama pull llama3.2
### Remove a model
```
ollama rm llama3.2
ollama rm llama3.1
```
### Copy a model
```
ollama cp llama3.2 my-model
ollama cp llama3.1 my-model
```
### Multiline input
@@ -183,14 +181,14 @@ The image features a yellow smiley face, which is likely the central focus of th
### Pass the prompt as an argument
```
$ ollama run llama3.2 "Summarize this file: $(cat README.md)"
$ ollama run llama3.1 "Summarize this file: $(cat README.md)"
Ollama is a lightweight, extensible framework for building and running language models on the local machine. It provides a simple API for creating, running, and managing models, as well as a library of pre-built models that can be easily used in a variety of applications.
```
### Show model information
```
ollama show llama3.2
ollama show llama3.1
```
### List models on your computer
@@ -199,18 +197,6 @@ ollama show llama3.2
ollama list
```
### List which models are currently loaded
```
ollama ps
```
### Stop a model which is currently running
```
ollama stop llama3.2
```
### Start Ollama
`ollama serve` is used when you want to start ollama without running the desktop application.
@@ -230,7 +216,7 @@ Next, start the server:
Finally, in a separate shell, run a model:
```
./ollama run llama3.2
./ollama run llama3.1
```
## REST API
@@ -241,7 +227,7 @@ Ollama has a REST API for running and managing models.
```
curl http://localhost:11434/api/generate -d '{
"model": "llama3.2",
"model": "llama3.1",
"prompt":"Why is the sky blue?"
}'
```
@@ -250,7 +236,7 @@ curl http://localhost:11434/api/generate -d '{
```
curl http://localhost:11434/api/chat -d '{
"model": "llama3.2",
"model": "llama3.1",
"messages": [
{ "role": "user", "content": "why is the sky blue?" }
]
@@ -309,25 +295,13 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Olpaka](https://github.com/Otacon/olpaka) (User-friendly Flutter Web App for Ollama)
- [OllamaSpring](https://github.com/CrazyNeil/OllamaSpring) (Ollama Client for macOS)
- [LLocal.in](https://github.com/kartikm7/llocal) (Easy to use Electron Desktop Client for Ollama)
- [AiLama](https://github.com/zeyoyt/ailama) (A Discord User App that allows you to interact with Ollama anywhere in Discord)
- [Ollama with Google Mesop](https://github.com/rapidarchitect/ollama_mesop/) (Mesop Chat Client implementation with Ollama)
- [Painting Droid](https://github.com/mateuszmigas/painting-droid) (Painting app with AI integrations)
- [Kerlig AI](https://www.kerlig.com/) (AI writing assistant for macOS)
- [AI Studio](https://github.com/MindWorkAI/AI-Studio)
- [Sidellama](https://github.com/gyopak/sidellama) (browser-based LLM client)
- [LLMStack](https://github.com/trypromptly/LLMStack) (No-code multi-agent framework to build LLM agents and workflows)
- [BoltAI for Mac](https://boltai.com) (AI Chat Client for Mac)
- [Harbor](https://github.com/av/harbor) (Containerized LLM Toolkit with Ollama as default backend)
- [Go-CREW](https://www.jonathanhecl.com/go-crew/) (Powerful Offline RAG in Golang)
- [PartCAD](https://github.com/openvmp/partcad/) (CAD model generation with OpenSCAD and CadQuery)
- [Ollama4j Web UI](https://github.com/ollama4j/ollama4j-web-ui) - Java-based Web UI for Ollama built with Vaadin, Spring Boot and Ollama4j
- [PyOllaMx](https://github.com/kspviswa/pyOllaMx) - macOS application capable of chatting with both Ollama and Apple MLX models.
- [Claude Dev](https://github.com/saoudrizwan/claude-dev) - VSCode extension for multi-file/whole-repo coding
- [Cherry Studio](https://github.com/kangfenmao/cherry-studio) (Desktop client with Ollama support)
- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy focused LLM chat interface with optional encryption)
- [Archyve](https://github.com/nickthecook/archyve) (RAG-enabling document library)
- [crewAI with Mesop](https://github.com/rapidarchitect/ollama-crew-mesop) (Mesop Web Interface to run crewAI with Ollama)
- [LLMChat](https://github.com/trendy-design/llmchat) (Privacy focused, 100% local, intuitive all-in-one chat interface)
### Terminal
@@ -352,11 +326,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [podman-ollama](https://github.com/ericcurtin/podman-ollama)
- [gollama](https://github.com/sammcj/gollama)
- [Ollama eBook Summary](https://github.com/cognitivetech/ollama-ebook-summary/)
- [Ollama Mixture of Experts (MOE) in 50 lines of code](https://github.com/rapidarchitect/ollama_moe)
- [vim-intelligence-bridge](https://github.com/pepo-ec/vim-intelligence-bridge) Simple interaction of "Ollama" with the Vim editor
### Apple Vision Pro
- [Enchanted](https://github.com/AugustDev/enchanted)
### Database
@@ -366,28 +335,23 @@ See the [API documentation](./docs/api.md) for all endpoints.
### Package managers
- [Pacman](https://archlinux.org/packages/extra/x86_64/ollama/)
- [Gentoo](https://github.com/gentoo/guru/tree/master/app-misc/ollama)
- [Helm Chart](https://artifacthub.io/packages/helm/ollama-helm/ollama)
- [Guix channel](https://codeberg.org/tusharhero/ollama-guix)
- [Nix package](https://search.nixos.org/packages?channel=24.05&show=ollama&from=0&size=50&sort=relevance&type=packages&query=ollama)
- [Flox](https://flox.dev/blog/ollama-part-one)
### Libraries
- [LangChain](https://python.langchain.com/docs/integrations/llms/ollama) and [LangChain.js](https://js.langchain.com/docs/modules/model_io/models/llms/integrations/ollama) with [example](https://js.langchain.com/docs/use_cases/question_answering/local_retrieval_qa)
- [Firebase Genkit](https://firebase.google.com/docs/genkit/plugins/ollama)
- [crewAI](https://github.com/crewAIInc/crewAI)
- [LangChainGo](https://github.com/tmc/langchaingo/) with [example](https://github.com/tmc/langchaingo/tree/main/examples/ollama-completion-example)
- [LangChain4j](https://github.com/langchain4j/langchain4j) with [example](https://github.com/langchain4j/langchain4j-examples/tree/main/ollama-examples/src/main/java)
- [LangChainRust](https://github.com/Abraxas-365/langchain-rust) with [example](https://github.com/Abraxas-365/langchain-rust/blob/main/examples/llm_ollama.rs)
- [LlamaIndex](https://docs.llamaindex.ai/en/stable/examples/llm/ollama/) and [LlamaIndexTS](https://ts.llamaindex.ai/modules/llms/available_llms/ollama)
- [LlamaIndex](https://gpt-index.readthedocs.io/en/stable/examples/llm/ollama.html)
- [LiteLLM](https://github.com/BerriAI/litellm)
- [OllamaFarm for Go](https://github.com/presbrey/ollamafarm)
- [OllamaSharp for .NET](https://github.com/awaescher/OllamaSharp)
- [Ollama for Ruby](https://github.com/gbaptista/ollama-ai)
- [Ollama-rs for Rust](https://github.com/pepperoni21/ollama-rs)
- [Ollama-hpp for C++](https://github.com/jmont-dev/ollama-hpp)
- [Ollama4j for Java](https://github.com/ollama4j/ollama4j)
- [Ollama4j for Java](https://github.com/amithkoujalgi/ollama4j)
- [ModelFusion Typescript Library](https://modelfusion.dev/integration/model-provider/ollama)
- [OllamaKit for Swift](https://github.com/kevinhermawan/OllamaKit)
- [Ollama for Dart](https://github.com/breitburg/dart-ollama)
@@ -404,17 +368,11 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Portkey](https://portkey.ai/docs/welcome/integration-guides/ollama)
- [PromptingTools.jl](https://github.com/svilupp/PromptingTools.jl) with an [example](https://svilupp.github.io/PromptingTools.jl/dev/examples/working_with_ollama)
- [LlamaScript](https://github.com/Project-Llama/llamascript)
- [Gollm](https://docs.gollm.co/examples/ollama-example)
- [Ollamaclient for Golang](https://github.com/xyproto/ollamaclient)
- [High-level function abstraction in Go](https://gitlab.com/tozd/go/fun)
- [Ollama PHP](https://github.com/ArdaGnsrn/ollama-php)
- [Agents-Flex for Java](https://github.com/agents-flex/agents-flex) with [example](https://github.com/agents-flex/agents-flex/tree/main/agents-flex-llm/agents-flex-llm-ollama/src/test/java/com/agentsflex/llm/ollama)
### Mobile
- [Enchanted](https://github.com/AugustDev/enchanted)
- [Maid](https://github.com/Mobile-Artificial-Intelligence/maid)
- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy focused LLM chat interface with optional encryption)
### Extensions & Plugins
@@ -439,16 +397,11 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [twinny](https://github.com/rjmacarthy/twinny) (Copilot and Copilot chat alternative using Ollama)
- [Wingman-AI](https://github.com/RussellCanfield/wingman-ai) (Copilot code and chat alternative using Ollama and Hugging Face)
- [Page Assist](https://github.com/n4ze3m/page-assist) (Chrome Extension)
- [Plasmoid Ollama Control](https://github.com/imoize/plasmoid-ollamacontrol) (KDE Plasma extension that allows you to quickly manage/control Ollama models)
- [AI Telegram Bot](https://github.com/tusharhero/aitelegrambot) (Telegram bot using Ollama in backend)
- [AI ST Completion](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (Sublime Text 4 AI assistant plugin with Ollama support)
- [Discord-Ollama Chat Bot](https://github.com/kevinthedang/discord-ollama) (Generalized TypeScript Discord Bot w/ Tuning Documentation)
- [Discord AI chat/moderation bot](https://github.com/rapmd73/Companion) Chat/moderation bot written in python. Uses Ollama to create personalities.
- [Headless Ollama](https://github.com/nischalj10/headless-ollama) (Scripts to automatically install the Ollama client & models on any OS for apps that depend on the Ollama server)
- [vnc-lm](https://github.com/jk011ru/vnc-lm) (A containerized Discord bot with support for attachments and web links)
- [LSP-AI](https://github.com/SilasMarvin/lsp-ai) (Open-source language server for AI-powered functionality)
- [QodeAssist](https://github.com/Palm1r/QodeAssist) (AI-powered coding assistant plugin for Qt Creator)
- [Obsidian Quiz Generator plugin](https://github.com/ECuiDev/obsidian-quiz-generator)
### Supported backends

View File

@@ -296,17 +296,15 @@ type EmbeddingResponse struct {
// CreateRequest is the request passed to [Client.Create].
type CreateRequest struct {
Model string `json:"model"`
Path string `json:"path"`
Modelfile string `json:"modelfile"`
Stream *bool `json:"stream,omitempty"`
Quantize string `json:"quantize,omitempty"`
// Deprecated: set the model name with Model instead
// Name is deprecated, see Model
Name string `json:"name"`
// Deprecated: set the file content with Modelfile instead
Path string `json:"path"`
// Deprecated: use Quantize instead
// Quantization is deprecated, see Quantize
Quantization string `json:"quantization,omitempty"`
}
@@ -314,7 +312,7 @@ type CreateRequest struct {
type DeleteRequest struct {
Model string `json:"model"`
// Deprecated: set the model name with Model instead
// Name is deprecated, see Model
Name string `json:"name"`
}
@@ -329,7 +327,7 @@ type ShowRequest struct {
Options map[string]interface{} `json:"options"`
// Deprecated: set the model name with Model instead
// Name is deprecated, see Model
Name string `json:"name"`
}
@@ -361,7 +359,7 @@ type PullRequest struct {
Password string `json:"password"`
Stream *bool `json:"stream,omitempty"`
// Deprecated: set the model name with Model instead
// Name is deprecated, see Model
Name string `json:"name"`
}
@@ -382,7 +380,7 @@ type PushRequest struct {
Password string `json:"password"`
Stream *bool `json:"stream,omitempty"`
// Deprecated: set the model name with Model instead
// Name is deprecated, see Model
Name string `json:"name"`
}

View File

@@ -28,8 +28,8 @@ AppPublisher={#MyAppPublisher}
AppPublisherURL={#MyAppURL}
AppSupportURL={#MyAppURL}
AppUpdatesURL={#MyAppURL}
ArchitecturesAllowed=x64compatible arm64
ArchitecturesInstallIn64BitMode=x64compatible arm64
ArchitecturesAllowed=x64 arm64
ArchitecturesInstallIn64BitMode=x64 arm64
DefaultDirName={localappdata}\Programs\{#MyAppName}
DefaultGroupName={#MyAppName}
DisableProgramGroupPage=yes
@@ -48,7 +48,6 @@ OutputDir=..\dist\
SetupLogging=yes
CloseApplications=yes
RestartApplications=no
RestartIfNeededByRun=no
; https://jrsoftware.org/ishelp/index.php?topic=setup_wizardimagefile
WizardSmallImageFile=.\assets\setup.bmp
@@ -87,21 +86,12 @@ Name: "english"; MessagesFile: "compiler:Default.isl"
DialogFontSize=12
[Files]
#if DirExists("..\dist\windows-amd64")
Source: "..\dist\windows-amd64-app.exe"; DestDir: "{app}"; DestName: "{#MyAppExeName}" ;Check: not IsArm64(); Flags: ignoreversion 64bit
Source: "..\dist\windows-amd64\ollama.exe"; DestDir: "{app}"; Check: not IsArm64(); Flags: ignoreversion 64bit
Source: "..\dist\windows-amd64\lib\ollama\*"; DestDir: "{app}\lib\ollama\"; Check: not IsArm64(); Flags: ignoreversion 64bit recursesubdirs
#endif
#if DirExists("..\dist\windows-arm64")
Source: "..\dist\windows-arm64\vc_redist.arm64.exe"; DestDir: "{tmp}"; Check: IsArm64() and vc_redist_needed(); Flags: deleteafterinstall
Source: "..\dist\windows-arm64-app.exe"; DestDir: "{app}"; DestName: "{#MyAppExeName}" ;Check: IsArm64(); Flags: ignoreversion 64bit
Source: "..\dist\windows-arm64\ollama.exe"; DestDir: "{app}"; Check: IsArm64(); Flags: ignoreversion 64bit
Source: "..\dist\windows-arm64\lib\ollama\*"; DestDir: "{app}\lib\ollama\"; Check: IsArm64(); Flags: ignoreversion 64bit recursesubdirs
#endif
Source: ".\app.exe"; DestDir: "{app}"; DestName: "{#MyAppExeName}" ; Flags: ignoreversion 64bit
Source: "..\ollama.exe"; DestDir: "{app}\bin"; Flags: ignoreversion 64bit
Source: "..\dist\windows-{#ARCH}\lib\ollama\runners\*"; DestDir: "{app}\lib\ollama\runners"; Flags: ignoreversion 64bit recursesubdirs
Source: "..\dist\ollama_welcome.ps1"; DestDir: "{app}"; Flags: ignoreversion
Source: ".\assets\app.ico"; DestDir: "{app}"; Flags: ignoreversion
Source: "..\dist\windows-amd64\lib\ollama\*"; DestDir: "{app}\lib\ollama\"; Flags: ignoreversion recursesubdirs
[Icons]
Name: "{group}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilename: "{app}\app.ico"
@@ -109,10 +99,7 @@ Name: "{userstartup}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilen
Name: "{userprograms}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilename: "{app}\app.ico"
[Run]
#if DirExists("..\dist\windows-arm64")
Filename: "{tmp}\vc_redist.arm64.exe"; Parameters: "/install /passive /norestart"; Check: IsArm64() and vc_redist_needed(); StatusMsg: "Installing VC++ Redistributables..."; Flags: waituntilterminated
#endif
Filename: "{cmd}"; Parameters: "/C set PATH={app};%PATH% & ""{app}\{#MyAppExeName}"""; Flags: postinstall nowait runhidden
Filename: "{cmd}"; Parameters: "/C set PATH={app}\bin;%PATH% & ""{app}\{#MyAppExeName}"""; Flags: postinstall nowait runhidden
[UninstallRun]
; Filename: "{cmd}"; Parameters: "/C ""taskkill /im ''{#MyAppExeName}'' /f /t"; Flags: runhidden
@@ -142,13 +129,13 @@ SetupAppRunningError=Another Ollama installer is running.%n%nPlease cancel or fi
;FinishedHeadingLabel=Run your first model
;FinishedLabel=%nRun this command in a PowerShell or cmd terminal.%n%n%n ollama run llama3.2
;FinishedLabel=%nRun this command in a PowerShell or cmd terminal.%n%n%n ollama run llama3.1
;ClickFinish=%n
[Registry]
Root: HKCU; Subkey: "Environment"; \
ValueType: expandsz; ValueName: "Path"; ValueData: "{olddata};{app}"; \
Check: NeedsAddPath('{app}')
ValueType: expandsz; ValueName: "Path"; ValueData: "{olddata};{app}\bin"; \
Check: NeedsAddPath('{app}\bin')
[Code]
@@ -167,39 +154,3 @@ begin
{ Pos() returns 0 if not found }
Result := Pos(';' + ExpandConstant(Param) + ';', ';' + OrigPath + ';') = 0;
end;
{ --- VC Runtime libraries discovery code - Only install vc_redist if it isn't already installed ----- }
const VCRTL_MIN_V1 = 14;
const VCRTL_MIN_V2 = 40;
const VCRTL_MIN_V3 = 33807;
const VCRTL_MIN_V4 = 0;
// Reports whether the VC++ redistributable still has to be installed.
// Reads the Major/Minor/Bld/RBld DWORD values of the arm64 VC runtime key
// under HKEY_LOCAL_MACHINE and compares them with VCRTL_MIN_V1..VCRTL_MIN_V4.
// Returns True when the values cannot be read or the version found is older
// than the minimum; returns False when it is equal to or later than the minimum.
function vc_redist_needed (): Boolean;
var
sRegKey: string;
v1: Cardinal;
v2: Cardinal;
v3: Cardinal;
v4: Cardinal;
begin
sRegKey := 'SOFTWARE\WOW6432Node\Microsoft\VisualStudio\14.0\VC\Runtimes\arm64';
if (RegQueryDWordValue (HKEY_LOCAL_MACHINE, sRegKey, 'Major', v1) and
RegQueryDWordValue (HKEY_LOCAL_MACHINE, sRegKey, 'Minor', v2) and
RegQueryDWordValue (HKEY_LOCAL_MACHINE, sRegKey, 'Bld', v3) and
RegQueryDWordValue (HKEY_LOCAL_MACHINE, sRegKey, 'RBld', v4)) then
begin
Log ('VC Redist version: ' + IntToStr (v1) +
'.' + IntToStr (v2) + '.' + IntToStr (v3) +
'.' + IntToStr (v4));
{ Version info was found. The parenthesised expression below is true when
the installed version is later than or equal to the minimum VCRTL_MIN_Vx
(compared field by field, most significant first); the result is its
negation, i.e. True only when the installed version is too old. }
Result := not (
(v1 > VCRTL_MIN_V1) or ((v1 = VCRTL_MIN_V1) and
((v2 > VCRTL_MIN_V2) or ((v2 = VCRTL_MIN_V2) and
((v3 > VCRTL_MIN_V3) or ((v3 = VCRTL_MIN_V3) and
(v4 >= VCRTL_MIN_V4)))))));
end
else
{ One or more registry values could not be read: report that the install is needed. }
Result := TRUE;
end;

View File

@@ -4,5 +4,5 @@ write-host "Welcome to Ollama!"
write-host ""
write-host "Run your first model:"
write-host ""
write-host "`tollama run llama3.2"
write-host "`tollama run llama3.1"
write-host ""

View File

@@ -1 +0,0 @@
This is here to make sure the build/ directory exists for the go:embed command

View File

@@ -1 +0,0 @@
This is here to make sure the build/ directory exists for the go:embed command

View File

@@ -1,8 +0,0 @@
package build
import "embed"
// Darwin payloads separated by architecture to avoid duplicate payloads when cross compiling
//go:embed darwin/amd64/*
var EmbedFS embed.FS

View File

@@ -1,8 +0,0 @@
package build
import "embed"
// Darwin payloads separated by architecture to avoid duplicate payloads when cross compiling
//go:embed darwin/arm64/*
var EmbedFS embed.FS

View File

@@ -1,6 +0,0 @@
package build
import "embed"
//go:embed linux/*
var EmbedFS embed.FS

View File

@@ -1,8 +0,0 @@
//go:build !linux && !darwin
package build
import "embed"
// unused on windows
var EmbedFS embed.FS

View File

@@ -1 +0,0 @@
This is here to make sure the build/ directory exists for the go:embed command

View File

@@ -1 +0,0 @@
This is here to make sure the build/ directory exists for the go:embed command

View File

@@ -2,7 +2,6 @@ package cmd
import (
"archive/zip"
"bufio"
"bytes"
"context"
"crypto/ed25519"
@@ -22,7 +21,6 @@ import (
"regexp"
"runtime"
"slices"
"strconv"
"strings"
"sync/atomic"
"syscall"
@@ -346,39 +344,6 @@ func (w *progressWriter) Write(p []byte) (n int, err error) {
return len(p), nil
}
// loadOrUnloadModel sends a generate request that carries only opts.Model and
// opts.KeepAlive, displaying a spinner on stderr until the call returns.
// Streamed responses are discarded; the request's error (if any) is returned.
func loadOrUnloadModel(cmd *cobra.Command, opts *runOptions) error {
	status := progress.NewProgress(os.Stderr)
	defer status.StopAndClear()

	status.Add("", progress.NewSpinner(""))

	client, err := api.ClientFromEnvironment()
	if err != nil {
		return err
	}

	// Nothing in the streamed responses is needed here.
	discard := func(api.GenerateResponse) error { return nil }

	return client.Generate(cmd.Context(), &api.GenerateRequest{
		Model:     opts.Model,
		KeepAlive: opts.KeepAlive,
	}, discard)
}
// StopHandler is the RunE handler for the stop command. It issues a request
// for the model named by args[0] with a keep-alive duration of zero.
//
// A failure whose message contains "not found" is reported with a friendlier
// message naming the model. Every other failure is returned unchanged.
func StopHandler(cmd *cobra.Command, args []string) error {
	opts := &runOptions{
		Model:     args[0],
		KeepAlive: &api.Duration{Duration: 0},
	}
	if err := loadOrUnloadModel(cmd, opts); err != nil {
		if strings.Contains(err.Error(), "not found") {
			return fmt.Errorf("couldn't find model \"%s\" to stop", args[0])
		}
		// Any other error used to fall through to `return nil`, so the
		// command reported success even though the request had failed.
		return err
	}
	return nil
}
func RunHandler(cmd *cobra.Command, args []string) error {
interactive := true
@@ -457,7 +422,7 @@ func RunHandler(cmd *cobra.Command, args []string) error {
opts.ParentModel = info.Details.ParentModel
if interactive {
if err := loadOrUnloadModel(cmd, &opts); err != nil {
if err := loadModel(cmd, &opts); err != nil {
return err
}
@@ -613,7 +578,7 @@ func ListHandler(cmd *cobra.Command, args []string) error {
table.SetHeaderLine(false)
table.SetBorder(false)
table.SetNoWhiteSpace(true)
table.SetTablePadding(" ")
table.SetTablePadding("\t")
table.AppendBulk(data)
table.Render()
@@ -648,15 +613,7 @@ func ListRunningHandler(cmd *cobra.Command, args []string) error {
cpuPercent := math.Round(float64(sizeCPU) / float64(m.Size) * 100)
procStr = fmt.Sprintf("%d%%/%d%% CPU/GPU", int(cpuPercent), int(100-cpuPercent))
}
var until string
delta := time.Since(m.ExpiresAt)
if delta > 0 {
until = "Stopping..."
} else {
until = format.HumanTime(m.ExpiresAt, "Never")
}
data = append(data, []string{m.Name, m.Digest[:12], format.HumanBytes(m.Size), procStr, until})
data = append(data, []string{m.Name, m.Digest[:12], format.HumanBytes(m.Size), procStr, format.HumanTime(m.ExpiresAt, "Never")})
}
}
@@ -667,7 +624,7 @@ func ListRunningHandler(cmd *cobra.Command, args []string) error {
table.SetHeaderLine(false)
table.SetBorder(false)
table.SetNoWhiteSpace(true)
table.SetTablePadding(" ")
table.SetTablePadding("\t")
table.AppendBulk(data)
table.Render()
@@ -763,89 +720,122 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
return nil
}
return showInfo(resp, os.Stdout)
showInfo(resp)
return nil
}
func showInfo(resp *api.ShowResponse, w io.Writer) error {
tableRender := func(header string, rows func() [][]string) {
fmt.Fprintln(w, " ", header)
table := tablewriter.NewWriter(w)
table.SetAlignment(tablewriter.ALIGN_LEFT)
table.SetBorder(false)
table.SetNoWhiteSpace(true)
table.SetTablePadding(" ")
func showInfo(resp *api.ShowResponse) {
arch := resp.ModelInfo["general.architecture"].(string)
switch header {
case "Template", "System", "License":
table.SetColWidth(100)
}
table.AppendBulk(rows())
table.Render()
fmt.Fprintln(w)
modelData := [][]string{
{"arch", arch},
{"parameters", resp.Details.ParameterSize},
{"quantization", resp.Details.QuantizationLevel},
{"context length", fmt.Sprintf("%v", resp.ModelInfo[fmt.Sprintf("%s.context_length", arch)].(float64))},
{"embedding length", fmt.Sprintf("%v", resp.ModelInfo[fmt.Sprintf("%s.embedding_length", arch)].(float64))},
}
tableRender("Model", func() (rows [][]string) {
if resp.ModelInfo != nil {
arch := resp.ModelInfo["general.architecture"].(string)
rows = append(rows, []string{"", "architecture", arch})
rows = append(rows, []string{"", "parameters", format.HumanNumber(uint64(resp.ModelInfo["general.parameter_count"].(float64)))})
rows = append(rows, []string{"", "context length", strconv.FormatFloat(resp.ModelInfo[fmt.Sprintf("%s.context_length", arch)].(float64), 'f', -1, 64)})
rows = append(rows, []string{"", "embedding length", strconv.FormatFloat(resp.ModelInfo[fmt.Sprintf("%s.embedding_length", arch)].(float64), 'f', -1, 64)})
} else {
rows = append(rows, []string{"", "architecture", resp.Details.Family})
rows = append(rows, []string{"", "parameters", resp.Details.ParameterSize})
}
rows = append(rows, []string{"", "quantization", resp.Details.QuantizationLevel})
return
})
mainTableData := [][]string{
{"Model"},
{renderSubTable(modelData, false)},
}
if resp.ProjectorInfo != nil {
tableRender("Projector", func() (rows [][]string) {
arch := resp.ProjectorInfo["general.architecture"].(string)
rows = append(rows, []string{"", "architecture", arch})
rows = append(rows, []string{"", "parameters", format.HumanNumber(uint64(resp.ProjectorInfo["general.parameter_count"].(float64)))})
rows = append(rows, []string{"", "embedding length", strconv.FormatFloat(resp.ProjectorInfo[fmt.Sprintf("%s.vision.embedding_length", arch)].(float64), 'f', -1, 64)})
rows = append(rows, []string{"", "dimensions", strconv.FormatFloat(resp.ProjectorInfo[fmt.Sprintf("%s.vision.projection_dim", arch)].(float64), 'f', -1, 64)})
return
})
projectorData := [][]string{
{"arch", "clip"},
{"parameters", format.HumanNumber(uint64(resp.ProjectorInfo["general.parameter_count"].(float64)))},
}
if projectorType, ok := resp.ProjectorInfo["clip.projector_type"]; ok {
projectorData = append(projectorData, []string{"projector type", projectorType.(string)})
}
projectorData = append(projectorData,
[]string{"embedding length", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.embedding_length"].(float64))},
[]string{"projection dimensionality", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.projection_dim"].(float64))},
)
mainTableData = append(mainTableData,
[]string{"Projector"},
[]string{renderSubTable(projectorData, false)},
)
}
if resp.Parameters != "" {
tableRender("Parameters", func() (rows [][]string) {
scanner := bufio.NewScanner(strings.NewReader(resp.Parameters))
for scanner.Scan() {
if text := scanner.Text(); text != "" {
rows = append(rows, append([]string{""}, strings.Fields(text)...))
}
}
return
})
}
head := func(s string, n int) (rows [][]string) {
scanner := bufio.NewScanner(strings.NewReader(s))
for scanner.Scan() && (len(rows) < n || n < 0) {
if text := scanner.Text(); text != "" {
rows = append(rows, []string{"", strings.TrimSpace(text)})
}
}
return
mainTableData = append(mainTableData, []string{"Parameters"}, []string{formatParams(resp.Parameters)})
}
if resp.System != "" {
tableRender("System", func() [][]string {
return head(resp.System, 2)
})
mainTableData = append(mainTableData, []string{"System"}, []string{renderSubTable(twoLines(resp.System), true)})
}
if resp.License != "" {
tableRender("License", func() [][]string {
return head(resp.License, 2)
})
mainTableData = append(mainTableData, []string{"License"}, []string{renderSubTable(twoLines(resp.License), true)})
}
return nil
table := tablewriter.NewWriter(os.Stdout)
table.SetAutoWrapText(false)
table.SetBorder(false)
table.SetAlignment(tablewriter.ALIGN_LEFT)
for _, v := range mainTableData {
table.Append(v)
}
table.Render()
}
// renderSubTable renders data as a borderless, left-aligned table padded with
// tabs and returns the rendered text with a tab prepended to every line.
// Automatic text wrapping is enabled unless file is true.
func renderSubTable(data [][]string, file bool) string {
	var out bytes.Buffer

	tw := tablewriter.NewWriter(&out)
	tw.SetAutoWrapText(!file)
	tw.SetBorder(false)
	tw.SetNoWhiteSpace(true)
	tw.SetTablePadding("\t")
	tw.SetAlignment(tablewriter.ALIGN_LEFT)
	for i := range data {
		tw.Append(data[i])
	}
	tw.Render()

	// Prefixing the text and every line break with a tab is the same as
	// splitting on "\n", prefixing each piece, and joining again.
	return "\t" + strings.ReplaceAll(out.String(), "\n", "\n\t")
}
// twoLines returns at most the first two non-blank lines of s. Each line is
// trimmed of surrounding whitespace and wrapped in its own one-column row.
// The result is an empty (non-nil) slice when s has no non-blank lines.
func twoLines(s string) [][]string {
	const limit = 2

	rows := [][]string{}
	for _, raw := range strings.Split(s, "\n") {
		trimmed := strings.TrimSpace(raw)
		if trimmed == "" {
			continue
		}
		rows = append(rows, []string{trimmed})
		if len(rows) == limit {
			break
		}
	}
	return rows
}
// formatParams splits s into lines, breaks each line into its
// whitespace-separated fields, and renders the resulting rows through
// renderSubTable with file set to false.
func formatParams(s string) string {
	lines := strings.Split(s, "\n")

	rows := make([][]string, len(lines))
	for i := range lines {
		rows[i] = strings.Fields(lines[i])
	}

	return renderSubTable(rows, false)
}
func CopyHandler(cmd *cobra.Command, args []string) error {
@@ -1335,15 +1325,6 @@ func NewCLI() *cobra.Command {
runCmd.Flags().Bool("insecure", false, "Use an insecure registry")
runCmd.Flags().Bool("nowordwrap", false, "Don't wrap words to the next line automatically")
runCmd.Flags().String("format", "", "Response format (e.g. json)")
stopCmd := &cobra.Command{
Use: "stop MODEL",
Short: "Stop a running model",
Args: cobra.ExactArgs(1),
PreRunE: checkServerHeartbeat,
RunE: StopHandler,
}
serveCmd := &cobra.Command{
Use: "serve",
Aliases: []string{"start"},
@@ -1411,7 +1392,6 @@ func NewCLI() *cobra.Command {
createCmd,
showCmd,
runCmd,
stopCmd,
pullCmd,
pushCmd,
listCmd,
@@ -1438,8 +1418,6 @@ func NewCLI() *cobra.Command {
envVars["OLLAMA_TMPDIR"],
envVars["OLLAMA_FLASH_ATTENTION"],
envVars["OLLAMA_LLM_LIBRARY"],
envVars["OLLAMA_GPU_OVERHEAD"],
envVars["OLLAMA_LOAD_TIMEOUT"],
})
default:
appendEnvDocs(cmd, envs)
@@ -1451,7 +1429,6 @@ func NewCLI() *cobra.Command {
createCmd,
showCmd,
runCmd,
stopCmd,
pullCmd,
pushCmd,
listCmd,

View File

@@ -1,206 +0,0 @@
package cmd
import (
"bytes"
"os"
"path/filepath"
"testing"
"github.com/google/go-cmp/cmp"
"github.com/ollama/ollama/api"
)
// TestShowInfo feeds a series of api.ShowResponse values through showInfo,
// capturing the output in a buffer, and diffs it against the expected text
// for each section (Model, Parameters, Projector, System, License).
func TestShowInfo(t *testing.T) {
// Only Details is populated; the Model section is expected to show its values.
t.Run("bare details", func(t *testing.T) {
var b bytes.Buffer
if err := showInfo(&api.ShowResponse{
Details: api.ModelDetails{
Family: "test",
ParameterSize: "7B",
QuantizationLevel: "FP16",
},
}, &b); err != nil {
t.Fatal(err)
}
expect := ` Model
architecture test
parameters 7B
quantization FP16
`
if diff := cmp.Diff(expect, b.String()); diff != "" {
t.Errorf("unexpected output (-want +got):\n%s", diff)
}
})
// ModelInfo is populated; context and embedding length rows are expected too.
t.Run("bare model info", func(t *testing.T) {
var b bytes.Buffer
if err := showInfo(&api.ShowResponse{
ModelInfo: map[string]any{
"general.architecture": "test",
"general.parameter_count": float64(7_000_000_000),
"test.context_length": float64(0),
"test.embedding_length": float64(0),
},
Details: api.ModelDetails{
Family: "test",
ParameterSize: "7B",
QuantizationLevel: "FP16",
},
}, &b); err != nil {
t.Fatal(err)
}
expect := ` Model
architecture test
parameters 7B
context length 0
embedding length 0
quantization FP16
`
if diff := cmp.Diff(expect, b.String()); diff != "" {
t.Errorf("unexpected output (-want +got):\n%s", diff)
}
})
// Parameters begins with a blank line; one row per non-blank line is expected.
t.Run("parameters", func(t *testing.T) {
var b bytes.Buffer
if err := showInfo(&api.ShowResponse{
Details: api.ModelDetails{
Family: "test",
ParameterSize: "7B",
QuantizationLevel: "FP16",
},
Parameters: `
stop never
stop gonna
stop give
stop you
stop up
temperature 99`,
}, &b); err != nil {
t.Fatal(err)
}
expect := ` Model
architecture test
parameters 7B
quantization FP16
Parameters
stop never
stop gonna
stop give
stop you
stop up
temperature 99
`
if diff := cmp.Diff(expect, b.String()); diff != "" {
t.Errorf("unexpected output (-want +got):\n%s", diff)
}
})
// ProjectorInfo is populated; a Projector section is expected after Model.
t.Run("project info", func(t *testing.T) {
var b bytes.Buffer
if err := showInfo(&api.ShowResponse{
Details: api.ModelDetails{
Family: "test",
ParameterSize: "7B",
QuantizationLevel: "FP16",
},
ProjectorInfo: map[string]any{
"general.architecture": "clip",
"general.parameter_count": float64(133_700_000),
"clip.vision.embedding_length": float64(0),
"clip.vision.projection_dim": float64(0),
},
}, &b); err != nil {
t.Fatal(err)
}
expect := ` Model
architecture test
parameters 7B
quantization FP16
Projector
architecture clip
parameters 133.70M
embedding length 0
dimensions 0
`
if diff := cmp.Diff(expect, b.String()); diff != "" {
t.Errorf("unexpected output (-want +got):\n%s", diff)
}
})
// System has three lines; only the first two are expected in the output.
t.Run("system", func(t *testing.T) {
var b bytes.Buffer
if err := showInfo(&api.ShowResponse{
Details: api.ModelDetails{
Family: "test",
ParameterSize: "7B",
QuantizationLevel: "FP16",
},
System: `You are a pirate!
Ahoy, matey!
Weigh anchor!
`,
}, &b); err != nil {
t.Fatal(err)
}
expect := ` Model
architecture test
parameters 7B
quantization FP16
System
You are a pirate!
Ahoy, matey!
`
if diff := cmp.Diff(expect, b.String()); diff != "" {
t.Errorf("unexpected output (-want +got):\n%s", diff)
}
})
// Uses the LICENSE file one directory up as input; only the two lines
// listed under License in expect should be printed.
t.Run("license", func(t *testing.T) {
var b bytes.Buffer
license, err := os.ReadFile(filepath.Join("..", "LICENSE"))
if err != nil {
t.Fatal(err)
}
if err := showInfo(&api.ShowResponse{
Details: api.ModelDetails{
Family: "test",
ParameterSize: "7B",
QuantizationLevel: "FP16",
},
License: string(license),
}, &b); err != nil {
t.Fatal(err)
}
expect := ` Model
architecture test
parameters 7B
quantization FP16
License
MIT License
Copyright (c) Ollama
`
if diff := cmp.Diff(expect, b.String()); diff != "" {
t.Errorf("unexpected output (-want +got):\n%s", diff)
}
})
}

View File

@@ -18,6 +18,7 @@ import (
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/parser"
"github.com/ollama/ollama/progress"
"github.com/ollama/ollama/readline"
"github.com/ollama/ollama/types/errtypes"
)
@@ -30,6 +31,26 @@ const (
MultilineSystem
)
// loadModel sends a chat request that carries only opts.Model and
// opts.KeepAlive (no messages), displaying a spinner on stderr until the call
// returns. Streamed responses are discarded; the request's error is returned.
func loadModel(cmd *cobra.Command, opts *runOptions) error {
	indicator := progress.NewProgress(os.Stderr)
	defer indicator.StopAndClear()

	indicator.Add("", progress.NewSpinner(""))

	client, err := api.ClientFromEnvironment()
	if err != nil {
		return err
	}

	// Nothing in the streamed responses is needed here.
	ignore := func(api.ChatResponse) error { return nil }

	return client.Chat(cmd.Context(), &api.ChatRequest{
		Model:     opts.Model,
		KeepAlive: opts.KeepAlive,
	}, ignore)
}
func generateInteractive(cmd *cobra.Command, opts runOptions) error {
usage := func() {
fmt.Fprintln(os.Stderr, "Available Commands:")
@@ -196,7 +217,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
opts.Model = args[1]
opts.Messages = []api.Message{}
fmt.Printf("Loading model '%s'\n", opts.Model)
if err := loadOrUnloadModel(cmd, &opts); err != nil {
if err := loadModel(cmd, &opts); err != nil {
return err
}
continue
@@ -350,7 +371,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
switch args[1] {
case "info":
_ = showInfo(resp, os.Stderr)
showInfo(resp)
case "license":
if resp.License == "" {
fmt.Println("No license was specified for this model.")

View File

@@ -208,18 +208,14 @@ func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error {
return err
}
vocabSize := int(p.VocabSize)
switch {
case vocabSize > len(t.Vocabulary.Tokens):
slog.Warn("vocabulary is smaller than expected, padding with dummy tokens", "expect", vocabSize, "actual", len(t.Vocabulary.Tokens))
if vocabSize := int(p.VocabSize); vocabSize > len(t.Vocabulary.Tokens) {
slog.Warn("vocabulary is smaller than expected, padding with dummy tokens", "expect", p.VocabSize, "actual", len(t.Vocabulary.Tokens))
for i := range vocabSize - len(t.Vocabulary.Tokens) {
t.Vocabulary.Tokens = append(t.Vocabulary.Tokens, fmt.Sprintf("[PAD%d]", i))
t.Vocabulary.Scores = append(t.Vocabulary.Scores, -1)
t.Vocabulary.Types = append(t.Vocabulary.Types, tokenTypeUserDefined)
}
case vocabSize < len(t.Vocabulary.Tokens):
return fmt.Errorf("vocabulary is larger than expected '%d' instead of '%d'", len(t.Vocabulary.Tokens), vocabSize)
default:
} else {
slog.Debug("vocabulary", "size", len(t.Vocabulary.Tokens))
}

View File

@@ -34,20 +34,10 @@ func (p *gemma2Model) KV(t *Tokenizer) llm.KV {
}
func (p *gemma2Model) Replacements() []string {
return []string{
"model.embed_tokens", "token_embd",
"model.norm", "output_norm",
"model.layers", "blk",
"input_layernorm", "attn_norm",
"self_attn.q_proj", "attn_q",
"self_attn.k_proj", "attn_k",
"self_attn.v_proj", "attn_v",
"self_attn.o_proj", "attn_output",
"mlp.gate_proj", "ffn_gate",
"mlp.down_proj", "ffn_down",
"mlp.up_proj", "ffn_up",
return append(
p.gemmaModel.Replacements(),
"post_attention_layernorm", "post_attention_norm",
"pre_feedforward_layernorm", "ffn_norm",
"post_feedforward_layernorm", "post_ffw_norm",
}
)
}

View File

@@ -15,7 +15,6 @@ import (
"os"
"path/filepath"
"slices"
"strings"
"testing"
"golang.org/x/exp/maps"
@@ -23,12 +22,6 @@ import (
"github.com/ollama/ollama/llm"
)
// tensorData describes a single tensor entry of the JSON header that the
// tests in this file marshal into a synthetic .safetensors file (see
// generateSafetensorTestData). The JSON tags fix the serialized key names.
type tensorData struct {
// Offsets is written under "data_offsets"; the tests here always supply
// two values, a start and an end position.
Offsets []int `json:"data_offsets"`
// Type is written under "dtype", e.g. "F32", "I8" or "U8" in these tests.
Type string `json:"dtype"`
// Shape is written under "shape"; the tests use an empty slice to build
// an entry the converter is expected to reject.
Shape []int `json:"shape"`
}
func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, llm.Tensors) {
t.Helper()
@@ -96,14 +89,13 @@ func TestMain(m *testing.M) {
os.Exit(m.Run())
}
func TestConvertModel(t *testing.T) {
func TestConvertFull(t *testing.T) {
cases := []string{
"Meta-Llama-3-8B-Instruct",
"Meta-Llama-3.1-8B-Instruct",
"Mistral-7B-Instruct-v0.2",
"Mixtral-8x7B-Instruct-v0.1",
"gemma-2b-it",
"gemma-2-2b-it",
// microsoft/Phi-3-mini-128-instruct@d548c233192db00165d842bf8edff054bb3212f8
"Phi-3-mini-128k-instruct",
"all-MiniLM-L6-v2",
@@ -148,132 +140,6 @@ func TestConvertModel(t *testing.T) {
}
}
// TestConvertInvalidTensorNames checks that ConvertModel rejects a model whose
// tensors collide on the same output name.
//
// The generated header holds two entries: one using a Hugging Face style name
// and one using a GGUF style name. NOTE(review): this relies on the converter's
// name replacer mapping the first name onto the second; confirm against the
// llama model's Replacements.
func TestConvertInvalidTensorNames(t *testing.T) {
	const wantPrefix = "duplicate tensor name"

	f, err := os.CreateTemp(t.TempDir(), "testmodel")
	if err != nil {
		t.Fatal(err)
	}
	defer f.Close()

	tempDir := t.TempDir()

	offset := 4096
	td := map[string]*tensorData{
		"model.layers.0.self_attn.q_proj.weight": {
			Offsets: []int{0, offset},
			Type:    "F32",
			Shape:   []int{4096, 4096},
		},
		"blk.0.attn_q.weight": {
			Offsets: []int{offset, offset * 2},
			Type:    "F32",
			Shape:   []int{4096, 4096},
		},
	}
	generateSafetensorTestData(t, tempDir, td)

	// Report "no error" and "wrong error" separately so a failure shows what
	// ConvertModel actually returned.
	err = ConvertModel(os.DirFS(tempDir), f)
	switch {
	case err == nil:
		t.Errorf("expected an error with prefix %q but conversion succeeded", wantPrefix)
	case !strings.HasPrefix(err.Error(), wantPrefix):
		t.Errorf("expected an error with prefix %q, got: %v", wantPrefix, err)
	}
}
// TestConvertInvalidDatatype checks that ConvertModel refuses a safetensors
// header containing a typed entry with an empty shape, and that it does so
// with the exact error text "unsupported safetensors model".
//
// The header pairs an I8 weight tensor with a zero-length U8
// "weight_format" entry whose shape is empty.
func TestConvertInvalidDatatype(t *testing.T) {
	const wantErr = "unsupported safetensors model"

	f, err := os.CreateTemp(t.TempDir(), "testmodel")
	if err != nil {
		t.Fatal(err)
	}
	defer f.Close()

	tempDir := t.TempDir()

	offset := 4096 * 14336
	td := map[string]*tensorData{
		"model.layers.0.mlp.down_proj.weight": {
			Offsets: []int{0, offset},
			Type:    "I8",
			Shape:   []int{4096, 14336},
		},
		"model.layers.0.mlp.down_proj.weight_format": {
			Offsets: []int{offset, offset},
			Type:    "U8",
			Shape:   []int{},
		},
	}
	generateSafetensorTestData(t, tempDir, td)

	// Report "no error" and "wrong error" separately so a failure shows what
	// ConvertModel actually returned.
	err = ConvertModel(os.DirFS(tempDir), f)
	switch {
	case err == nil:
		t.Errorf("expected error %q but conversion succeeded", wantErr)
	case err.Error() != wantErr:
		t.Errorf("expected error %q, got: %v", wantErr, err)
	}
}
// generateSafetensorTestData populates tempDir with a minimal model directory
// for conversion tests. Three files are written, in this order:
//
//   - model-00001-of-00001.safetensors: an 8-byte little-endian length followed
//     by the JSON encoding of tensorData. No tensor payload bytes are written.
//   - config.json: declares the single architecture "LlamaForCausalLM".
//   - tokenizer.json: an empty JSON object.
//
// Any failure aborts the calling test via t.Fatal.
func generateSafetensorTestData(t *testing.T, tempDir string, tensorData map[string]*tensorData) {
	// Attribute failures to the calling test rather than to this helper.
	t.Helper()

	header, err := json.Marshal(tensorData)
	if err != nil {
		t.Fatal(err)
	}

	var buf bytes.Buffer
	if err := binary.Write(&buf, binary.LittleEndian, int64(len(header))); err != nil {
		t.Fatal(err)
	}
	if _, err := buf.Write(header); err != nil {
		t.Fatal(err)
	}

	configData := `
{
"architectures": [
"LlamaForCausalLM"
]
}
`

	tokenizerData := `
{
}
`

	files := []struct {
		name string
		data []byte
	}{
		{"model-00001-of-00001.safetensors", buf.Bytes()},
		{"config.json", []byte(configData)},
		{"tokenizer.json", []byte(tokenizerData)},
	}

	for _, file := range files {
		// os.WriteFile creates or truncates the file, writes it and closes it,
		// reporting close errors as well. 0o666 matches what os.Create uses.
		if err := os.WriteFile(filepath.Join(tempDir, file.name), file.data, 0o666); err != nil {
			t.Fatal(err)
		}
	}
}
func TestConvertAdapter(t *testing.T) {
type AdapterCase struct {
Name string
@@ -355,6 +221,11 @@ func TestConvertAdapter(t *testing.T) {
}
func generateLoraTestData(t *testing.T, tempDir string) {
type tensorData struct {
Offsets []int `json:"data_offsets"`
Type string `json:"dtype"`
Shape []int `json:"shape"`
}
offset := 4096 * 8 * 4
td := map[string]*tensorData{"__metadata__": nil}

View File

@@ -4,7 +4,6 @@ import (
"bytes"
"encoding/binary"
"encoding/json"
"errors"
"fmt"
"io"
"io/fs"
@@ -49,19 +48,8 @@ func parseSafetensors(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]T
keys := maps.Keys(headers)
slices.Sort(keys)
names := make(map[string]struct{}, len(keys))
for _, key := range keys {
if value := headers[key]; value.Type != "" {
// bitsandbytes quantized models are unsupported
if len(value.Shape) == 0 {
return nil, errors.New("unsupported safetensors model")
}
ggufName := replacer.Replace(key)
if _, ok := names[ggufName]; ok {
return nil, fmt.Errorf("duplicate tensor name '%s' was found for this model", ggufName)
}
names[ggufName] = struct{}{}
ts = append(ts, safetensor{
fs: fsys,
path: p,
@@ -69,7 +57,7 @@ func parseSafetensors(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]T
offset: safetensorsPad(n, value.Offsets[0]),
size: safetensorsPad(n, value.Offsets[1]) - safetensorsPad(n, value.Offsets[0]),
tensorBase: &tensorBase{
name: ggufName,
name: replacer.Replace(key),
shape: value.Shape,
},
})

View File

@@ -1,312 +0,0 @@
{
"general.architecture": "gemma2",
"general.file_type": "1",
"general.quantization_version": "2",
"gemma2.block_count": "26",
"gemma2.context_length": "8192",
"gemma2.embedding_length": "2304",
"gemma2.feed_forward_length": "9216",
"gemma2.attention.head_count": "8",
"gemma2.attention.head_count_kv": "4",
"gemma2.attention.key_length": "256",
"gemma2.attention.value_length": "256",
"gemma2.attention.layer_norm_rms_epsilon": "1e-06",
"tokenizer.ggml.model": "llama",
"tokenizer.ggml.add_bos_token": "true",
"tokenizer.ggml.add_eos_token": "false",
"tokenizer.ggml.bos_token_id": "2",
"tokenizer.ggml.eos_token_id": "1",
"tokenizer.ggml.padding_token_id": "0",
"tokenizer.ggml.unknown_token_id": "3",
"tokenizer.ggml.scores": "0872465d173867d755d3ee728f882b9dc2057a0bfd596fe1e3d131522f1250d8",
"tokenizer.ggml.token_type": "8d40143b3477df77beea4139420335ede458bf5e14102f01b0170197b55da8d8",
"tokenizer.ggml.tokens": "c6e66de1841f04de8b8d236d461ab720a4c9b9b5414dc293a09c6e10eab45fda",
"token_embd.weight": "64a9d30707e659e2e673656d71f5aef7a9fb9fd83bb9a77558dfc5abbe218a05",
"blk.0.attn_k.weight": "d8b4437c5edb3cddf6af9987038e1bb2b191c4f0fce0e160d2abace717f5d5d7",
"blk.0.attn_norm.weight": "1eb73e3f7aa8e502f6ca31cd19efbb8e4fd9a89692e13e48ac8205545a7fa7e8",
"blk.0.attn_output.weight": "39e7b78e57d356a22dd89ce1c4d7163b970712ba756545e1703f97866cd2192e",
"blk.0.attn_q.weight": "795058e23b6109febd9d55c89e1eebe6af0714ec8c56fd86a160876a6135ffe8",
"blk.0.attn_v.weight": "0cd6e583d1887c020472e961bbb113fe5a0d23ae2f1c2c876fc366cdb7692b52",
"blk.0.ffn_down.weight": "51eb4d962189e945a84e94e0dc1aad3f8f90cc1a11e18029670afcd0ea0acb1b",
"blk.0.ffn_gate.weight": "9811a29b8ad48432925897ab21dfcb13c5cbd372aeccbbefca9b7866883b4ce3",
"blk.0.ffn_norm.weight": "92cbf4652ef503c1de5b10f2be00b3fcf00100980cb3baa8f3013a8d8bf3d851",
"blk.0.ffn_up.weight": "af87de21746879483ed1b374cdd76b19ba11ca2b6dbb1beba98efdf3be3e8077",
"blk.0.post_attention_norm.weight": "32e135f1f258ffe407018899e39af1725d59d66d60022b9a21575ba160e0357a",
"blk.0.post_ffw_norm.weight": "ba286f5ac11b07fbc986173708c66f1920427be5a6d108af38fa0a837c1c8eb6",
"blk.1.attn_k.weight": "51584435552051f7fade76beca582b3f7190cf7fc07adcf527c2774d4b1c3901",
"blk.1.attn_norm.weight": "6833104c7fbf35a7e799ae56c262b97fffa14789642aee14381b25acd21ed80a",
"blk.1.attn_output.weight": "14c39481369087bf292ac9a3ab2ef166f9fe376a9f90c246653213ef264febdc",
"blk.1.attn_q.weight": "443f64ae2229f857c69d6bebb7800b685786cb77884c3ae19d4286aeed081325",
"blk.1.attn_v.weight": "0df482de2038f1e4c8a7733ac0ddb69ad90759dab5968b942af0155588de4c4a",
"blk.1.ffn_down.weight": "66f30763a8bbbcaea609a0087ed75fadb5e771c06378dd2cea94cf17e492e8cf",
"blk.1.ffn_gate.weight": "a7151bff00a545fa18b2c92dcd2a14572ccf9beb957a6c494f1374e8ebe174c9",
"blk.1.ffn_norm.weight": "e197d71ea11b5276bc0167d2663b88089b3ff42b47ba91e85f6c5d95f6306435",
"blk.1.ffn_up.weight": "57c182e0b14cccd1350d388f0c616991702e74281db54637451b70f4ccc24f9b",
"blk.1.post_attention_norm.weight": "3c56f837168d784c2d8bac247c130bdca6610c095c8da4558c536ccad7605609",
"blk.1.post_ffw_norm.weight": "d2a51d320fd01069dd7ccaa7082f16a7faeb671885607d7900b10a89c354d0fa",
"blk.2.attn_k.weight": "bc103c818192de7ce36caaf89dc117be4df13fb902e0bd9a23c64edace5df9b6",
"blk.2.attn_norm.weight": "0f2503aa126083a5d6ac72481be1ef66c6014705b573682b35bd864e4749a3d5",
"blk.2.attn_output.weight": "05fcd4a1226e482f91803a266f72caca887a93e63c2d2ba5611ab3c68d38743a",
"blk.2.attn_q.weight": "6a10b5c2fd423d1e4c4fd60fa8c154a0159b6b2501ea79cae2ef19f45a674e5e",
"blk.2.attn_v.weight": "3cf891945a1f8ae7cc908a5c6b729ff5b70f4436c5ffdbf245cc0ed4cc19cd1b",
"blk.2.ffn_down.weight": "ea204fd04e0d2fc728a9861a459216bbfec629c152004ba625f52cd8837bd51e",
"blk.2.ffn_gate.weight": "3a3518729f1b8b64a82b8792f33987db5418fdb094be0263c68f146a5c38de54",
"blk.2.ffn_norm.weight": "754ede678b725de41a34b82f0edf7688b5c065be7c0d46df6f7ad9430d986884",
"blk.2.ffn_up.weight": "ffdcb88439f5828ffbd9fc844b03ff91637b790b9838097258cc3ae75935720c",
"blk.2.post_attention_norm.weight": "4b3f53b7ba26e8c36b2dfda3b7e5fc4b1065257cefdea235fc7df9af130ac2fd",
"blk.2.post_ffw_norm.weight": "e550369e26b8485e2b54ad34b34bc98af5494287dcc513c2c39cf1eaa5b89d07",
"blk.3.attn_k.weight": "89f24ea450e37d9e95757651a83205c085d81b354ee9489dd6310a391d8409f3",
"blk.3.attn_norm.weight": "24e2ea662b7cb822b4ca5cd61bc17f2709f406d990ec3b4a0dac1cc112db45cf",
"blk.3.attn_output.weight": "ac4dad69473c6e3fac56669212cadd8c34ecc5973d945972e974d94805334967",
"blk.3.attn_q.weight": "b6a9c9a7d4722b9096631c65de62228dfddca6e26edfe6af7fce01e116ef0f4c",
"blk.3.attn_v.weight": "f272a960a40093942309bc342a379984cbacec2d7bc64428db3f64e6b1887ed4",
"blk.3.ffn_down.weight": "c0188ba50d8228805982029c277fc0e87aa57473b8363037c648f6d006ff828a",
"blk.3.ffn_gate.weight": "a04aec1561ee6c0fbb18c3db49dc62fb533619cf697fd548cbf2279761aaec3b",
"blk.3.ffn_norm.weight": "bc053837d44087ec05eb5d9458357b2a5be787789b19cdbbdc694b57697f99a6",
"blk.3.ffn_up.weight": "b3ce8b274f20796d3b1a7c08ba27a919066f9de89a782faa544c4a8d6bea1382",
"blk.3.post_attention_norm.weight": "9c922dee7a7df5667289e2788e60170238239cee2dfdbbd9e435763f9f416718",
"blk.3.post_ffw_norm.weight": "b682544ac953ad2e0b49027ed8916f2e9d1aba5d1587bb4127ac703570c7a03a",
"blk.4.attn_k.weight": "143b0cbb4b787b95c2b6212374410e32173ccef2adb914908a2f89a7916de512",
"blk.4.attn_norm.weight": "5668f60491b780273745192662d02c9a92a4f692b29d16aa0bbc7413fec4f85b",
"blk.4.attn_output.weight": "b9f2bdb68be1e0cf66dd19f8fa2afb105910ad2ef394864cb32cea8f8944e0d5",
"blk.4.attn_q.weight": "ddcf1343dafbc2dfcd0b8741225af22fe4b54b2becce29240bd01c34265d126c",
"blk.4.attn_v.weight": "6dc7074366e7ed52d9f48c594dcc85bef738e096276cb99d28228c89eecc5b9c",
"blk.4.ffn_down.weight": "30334ffc59ce343cf2a1b973174acb7722823463adc07e19a99bd0f404bc9906",
"blk.4.ffn_gate.weight": "890f7c8af208d63b28db52c4b8c16c2288a382d87ff5a6a6d6b0a5b3bf27e6cd",
"blk.4.ffn_norm.weight": "ff0316cc7847221eb86a90c1ab441d4ee61553d410c66414a7755021b3b12448",
"blk.4.ffn_up.weight": "6af97d113f91564c636734f215e25ee602d48eb045458f300b3ec7582be0f41d",
"blk.4.post_attention_norm.weight": "69438f231e105e68216b078bdeb35a7cdc8b12c4e2845e18ecf4c8d361d6a321",
"blk.4.post_ffw_norm.weight": "0fd535da78bcf2b32c95b05b2b83dc49817393765be90d8cc1ed3d56f47b68ec",
"blk.5.attn_k.weight": "0166eb3c6d20dcf3d3c169e94caa8dee057535bb525e29f698fb6f8844f18a6c",
"blk.5.attn_norm.weight": "a7808f27f164023d5cde2be00fc23cac6c71aa0ddeb60bc23e12411b80087672",
"blk.5.attn_output.weight": "8b65b2027a0842b68c5308f91d6a31de9599d794157d77df8418b19f9e0d9334",
"blk.5.attn_q.weight": "966bc626ef2c2394d872087a41c126bb1b67d1d5f6de920204ef5e5b16c34003",
"blk.5.attn_v.weight": "9a362aef3f4437fbf0ef6e1ba785f3329c3db2960f93fe36547d2795e9c254ea",
"blk.5.ffn_down.weight": "63e53541d34197720c06f297aa8142ac6b6eec002c7987b296f26e8b1400f931",
"blk.5.ffn_gate.weight": "d9591fdd32f783e0fc26e20d5d587ee8971ac8ae2e4c818c6eac1c125c7c7f37",
"blk.5.ffn_norm.weight": "677334cc60ecce3a7f4ab3acda15d359353d7358872f614ad8914e3780e9fc6e",
"blk.5.ffn_up.weight": "a63764110e1c655ffbd55af0669b2dfe4cc29d0e198d33a8e5426461b08a85f7",
"blk.5.post_attention_norm.weight": "c55499f859b2c0a7f5cabceaae47309a5ad38bc29d0f4a8db81f1357023162a9",
"blk.5.post_ffw_norm.weight": "82752754665f842418f3e302cb5f43d1e0504dcd124c4b8ddb77018b2c793837",
"blk.6.attn_k.weight": "e20a5f0d6c807273c8d491439566b428497ac02097cf0aa55e33748c28e14be6",
"blk.6.attn_norm.weight": "2c6ba42fd3c73d72073ced03a32dd28d70a89ed9bbbc8fea1ba03a7ade951e6c",
"blk.6.attn_output.weight": "4de7c5c2f4a133a266e17ed8c14c52959466b54cc7ab9e19f789a33b4850f284",
"blk.6.attn_q.weight": "56462d921800e6b8cd2213fef04c4ff16d728905cb2f4c58e966d0a053a3b0ae",
"blk.6.attn_v.weight": "b758dcbff769d6240c2245ede1dbc62c4170a67c77458e866312589220fe29af",
"blk.6.ffn_down.weight": "582247fb3c2bf687cbe9413fe18d18ad47bef4b65df7d78905e10335c6134764",
"blk.6.ffn_gate.weight": "3035444d5286aefb7a6d04e55bc27e1fac7cf895cd5be02319a431b8e047b4ae",
"blk.6.ffn_norm.weight": "e582d24c66e01b96faa20ce6adfda3d8583b11e809bff89969927398175e369a",
"blk.6.ffn_up.weight": "6f4b7bbfedeacf61a4866ae0616c4ba6c9e856662e8f00ae6aaec7f52c53e7b4",
"blk.6.post_attention_norm.weight": "8fe51b50bd677d21586aecab0b565c4bf9fa68ad50bfe366f45e8fea3c657ca8",
"blk.6.post_ffw_norm.weight": "81ba3cb4c2bf5c546b86855b7a885d3fafededc67eb3a35cd3598b03c9e26e65",
"blk.7.attn_k.weight": "2e044179cdcae0946708c86bfea7aa0391e1f7e2a09b33fca035d384cc3ca758",
"blk.7.attn_norm.weight": "94b48c546b046803c60e75a3acb17a356b710735989938021b565f68df9b4985",
"blk.7.attn_output.weight": "65709b4ad7a581f4d75793d39d4032a359f6bcc0c3835205242a0b99e5b66824",
"blk.7.attn_q.weight": "8ded993c95d1f7caf201ceb6fa035cd6ed6d351b50b999fa9355dfee9486cb5b",
"blk.7.attn_v.weight": "c92d5e2d2d48397542bc03bea25bf39154075e66c5bb1ead85188505aa04ae91",
"blk.7.ffn_down.weight": "e8ba8fb57208805ef1dc23cd7c86e9a2d1fb7c52c3940d292cd5bb2eb24b3fac",
"blk.7.ffn_gate.weight": "f0f06d6a2e06c5ac252083bc61d05c814e6289d3f4e4a87d2f06918254c02c36",
"blk.7.ffn_norm.weight": "ebf8ef775f72624148e09d68a4332187a7a5020c521fe0623da1cd3485ad33e0",
"blk.7.ffn_up.weight": "a554adc4fc7122c247c77670e169916ba1794c787b5be30a2b36705138f1f746",
"blk.7.post_attention_norm.weight": "3aa6bc21d85c3a0c12b964e82b12feaedfdd13130c3cd2229228e24e0967ebdf",
"blk.7.post_ffw_norm.weight": "508bc7b19ee8ff08f0007c890133a462fc57c7e72b16ee8f6dd64def264ef876",
"blk.8.attn_k.weight": "363c8e74056642fe9e7c2f3f9769d57319cd3fa0a6022810189ab8d894322885",
"blk.8.attn_norm.weight": "685b49a1f1acb169f4df0bdd8e3de6943f3033cebad14b898a72000595610d92",
"blk.8.attn_output.weight": "7bde571e4efef1c6a6143f0526721dfb59e0a0ea0e1a3616a322b2eb937efa48",
"blk.8.attn_q.weight": "fc993dbc1074c28a0e1d85e5ab2f4ea6a9c6c1affe7ee56027000a275daed9b6",
"blk.8.attn_v.weight": "281e8791d3aef9b3864f1cb054da0ae0c2fef4ce0a58b1bad8bc136b2fa0f62b",
"blk.8.ffn_down.weight": "b1164a2578a7f87ed99c2bbc76c5dfbbbc6a1a803605391acc3f320fc989ffd7",
"blk.8.ffn_gate.weight": "6b39a3b3aaaa79aee61416b54d62160b9258042650e61c6b47bc77c2dd17daf3",
"blk.8.ffn_norm.weight": "17ea1362c72da27f12bc936500492035bdef3fd8f940cb12b57f37d42ba8ecb1",
"blk.8.ffn_up.weight": "bc3a7c47afc440d2bdf8fbe9ddf2c9220467472c60c8b4ded8c0f181470ec96c",
"blk.8.post_attention_norm.weight": "5c506204e00411ef9c8b4134d40eedcc19fffe68dd0af7d7cc49dcabf2dfac7e",
"blk.8.post_ffw_norm.weight": "002faec235c3678864e2901eed275ce4e9dc229164a91c9cd4c965142ba62305",
"blk.9.attn_k.weight": "0bab39d8c237f1b6d0010db40467142625a9e6f2e0e4c49a56c12b41e4e0b1fa",
"blk.9.attn_norm.weight": "de5f38e873b17f07aa7598831b89cc1cae2c9bc3eb2e042ee9af059d2563e84e",
"blk.9.attn_output.weight": "8a8184702c25a62df9ff309c0c7badc8587208523b2be3e8fa90ce7080573e6f",
"blk.9.attn_q.weight": "7c961b2431b09ddf95377acd07201cb91bf13d9cd3ae0f2c25c7d6a0358d9f50",
"blk.9.attn_v.weight": "e22d240cb4743067033e659cbf210ebe2ebbab3e1dea6ccbe5eaa982382ca038",
"blk.9.ffn_down.weight": "a426f81210f03d6ad53277416e1fdcdf37d8065e4817613edaf6c67a343426be",
"blk.9.ffn_gate.weight": "a82eba825cb77b8e64f85ff99ede2fc71bc9b01751eeb17e9e6c246ee12ea62e",
"blk.9.ffn_norm.weight": "1a97f9b1302a3a326d534c5c3fed2db6db0ae45fd0edd381a3e4fc1c75d81030",
"blk.9.ffn_up.weight": "5f20bac2bbf03bb42adb92fbf99561651e1edda57e0b61935ac7f6c08c0ed7cb",
"blk.9.post_attention_norm.weight": "9f9866d13988e1946b1e1c80d9374a92a6e3be33748f8eaed3e126d1e1a4c796",
"blk.9.post_ffw_norm.weight": "a6896dbf698db4dbbe5dbf12417d4fd80e9cad0c539c858892ec0aa5b046bb58",
"blk.10.attn_k.weight": "ca8446e5d21ecd4e6a70dca8d321be480be4fba94d70cba065205436feb44270",
"blk.10.attn_norm.weight": "4f41fe290e8f21f63b82151b6cce94bf7318d121468816b0c58af0ff7c1658ab",
"blk.10.attn_output.weight": "c626d2e9681c5c941bbde43dddfae1a8d4986bf2be4470857bc8e8bd7f869044",
"blk.10.attn_q.weight": "1e61b210a13a429977325cf15d781ab77d604cfa862f4270329cbd94237d5835",
"blk.10.attn_v.weight": "8ff8d3e3f058ec3b35ada1057f2ed59c06494d0e0be6a8dc3ff9edf9f0e1a115",
"blk.10.ffn_down.weight": "bcebc04219f8081a5f483e58103c0ddbbbc631a0a54fd6dd9d55778e041f70ee",
"blk.10.ffn_gate.weight": "7a23a1e620ef871384ddf9611ccdcfb893fbf013cc203ac8e72f745420f1eea0",
"blk.10.ffn_norm.weight": "e3a375e43c349a1c6c66c22328e513cc1af3137fe839e43dc8e9be2f65914fd7",
"blk.10.ffn_up.weight": "5d182e7c94369194fca5f19cbbe668a999911e57f3d363bc7fb6088428700cb9",
"blk.10.post_attention_norm.weight": "b841c6308296e8984f3c5f549c6e3a242f4b3e19141e1f54cc08de9c46759c09",
"blk.10.post_ffw_norm.weight": "9d66fa05b5c940208f634f5053d809094c99a2a10a1d1e8847c8281fbd99fb49",
"blk.11.attn_k.weight": "14adf24ebb2bb17b336ca81cec3e690fd854782f4440ca6c66cc1d7e7bf1c850",
"blk.11.attn_norm.weight": "2d2213f311f50414702b5b34f22aafb9d9a0b6787243e7578562583dc40ad195",
"blk.11.attn_output.weight": "de1f14cc2a7fff00cf11b229f0576999205f17b9536e97abc9d6de3cc79a7884",
"blk.11.attn_q.weight": "2bcc5c147524003109ece0be08b89ac8b25baa71416ffa76573c6c052ffc6eea",
"blk.11.attn_v.weight": "2e6ab8573070c22dc1e0d7aebe4d52123226dacf7822dcce06fadbb38fb036a4",
"blk.11.ffn_down.weight": "1b86902f4e36868421e5228b9445051f8290b292df22a6d1af836dcecc1f25c3",
"blk.11.ffn_gate.weight": "e756e8081bd0a16aea4a9ef5076ad102113524f7a3d50a3a77aaa7f7938b63e8",
"blk.11.ffn_norm.weight": "6913887267be227cf9d1991a3dd8db2e7e74bb9b5fbdfcb9ac954fd7d7b95b3b",
"blk.11.ffn_up.weight": "619a3ac0609ebdf42c3fb2b6e4b1db48df79e6dd8418d7ab8f1bbff13d8a6a50",
"blk.11.post_attention_norm.weight": "e4b4ba92cef7b6a78407e8ab1b0307d47dac6c3df7b6817e28038317ff662d7e",
"blk.11.post_ffw_norm.weight": "40aceeec58cb855f0c158c9cc217168fcd5d0e735567d587217b1d78df17bc5f",
"blk.12.attn_k.weight": "c54c5a4d4892522022d1aa2204cfc624f0b4042caa536e678967316293fe5cb1",
"blk.12.attn_norm.weight": "7cd2ef58298569ffdf244d9b390f3917245276c8206e5780af5f96d8c0bbb446",
"blk.12.attn_output.weight": "85495ef9cc8b3deb21f741bde463ff6493acae2be51f02ecdeef952cbdec3375",
"blk.12.attn_q.weight": "d19383f83fd119bfb8c0280c9515705c11d8e7d502019fcf8f49efeef0d106d0",
"blk.12.attn_v.weight": "869ac669ba49531d9128892a0e27cef15de508ff40cdf80cc1681dde50d09204",
"blk.12.ffn_down.weight": "578f39f8f9fc2f09138afc884a952d7cc3a9a31de4216acd10e88e19e0b75f8c",
"blk.12.ffn_gate.weight": "e29a0186bc6c4a0720246306e922d3a83f777dadcf4ac80bad468287031cc8b5",
"blk.12.ffn_norm.weight": "e1ee95c6584b5cb57fcf1db8ce2bcc03aff91eb389238c094a61c00dde93d1f2",
"blk.12.ffn_up.weight": "2a826f06d7cdfb3edc6ae250ff44363ef77a2a9cdf96313e23a331b99ebfa17d",
"blk.12.post_attention_norm.weight": "4bafc7699b948d5cbc0d3e09b418b06c6abc4651a61ada9609d9a2f21c7e5607",
"blk.12.post_ffw_norm.weight": "bbb8c34a7176bb1a49f9fe2bacca0bd26b673d52c0835b2e90fa11f2962f077f",
"blk.13.attn_k.weight": "ffeefccfe8255d1b694382012ff4134eee5fec9d9491c8d0ff0a13832d1a37e8",
"blk.13.attn_norm.weight": "35713726529e3887c4135a88e86e8a4d7270ba5b9f2d1ab462622fbf40a7cdce",
"blk.13.attn_output.weight": "0d60b7c5cd71190a9ef4b873b0f516be15447c32d83914db2794b14592b0b460",
"blk.13.attn_q.weight": "8296069e65bef794cefc61257fc65789b3cb22955e30f3df129205e5041b2222",
"blk.13.attn_v.weight": "ca0f4ab9d16a748fc643a5c0c7a19826a811bf2a4e7316a8c935d4bf0ce8abc6",
"blk.13.ffn_down.weight": "d5514e0c8e7b3ed1cbcc1605eb5be1733b6ab3514cf8a0508fc72f7d05ed8bcb",
"blk.13.ffn_gate.weight": "8108e517a82e08a3aefbbd267bfa50a1668f92a76273280ce8a6bc1f6dd61521",
"blk.13.ffn_norm.weight": "5fcb6132d2134bf1f835b904a99820fa501dbc57d2224129f7098bf3cabc1d36",
"blk.13.ffn_up.weight": "6d744b7cd390a3cae3aa350dd379b81246acd056a2259996b6aaadece8465ccc",
"blk.13.post_attention_norm.weight": "e08b14698912509790e9575b8676971fbb0a4d82d719367e3756c0d0c4ab8cc0",
"blk.13.post_ffw_norm.weight": "2b196e4450fc5f1e7367b2cf7fe33a15fe919fbcdd861d11002346f16e980535",
"blk.14.attn_k.weight": "120e5f48d7268dfd9ab5f4bc9cc57a7cec63ea9635f56b80d435eb22936e9483",
"blk.14.attn_norm.weight": "146367bcce4db72cc894419a2e0145a6f533507dd68e4739c10ee480308c401f",
"blk.14.attn_output.weight": "720fa0165e756876c5cb6ad9e2780dd910390933f3f8849e5add5da04266650b",
"blk.14.attn_q.weight": "f5183466f56219ca1aca52d8b82c2d966a4198fea40fdd6b39f4d8b06ca2a6dd",
"blk.14.attn_v.weight": "24f8ea3d5512cd37c43c8329cb0da0c90d1895aef763ac2dcee3fe5157ec50a2",
"blk.14.ffn_down.weight": "e29960965b384ae5ab3d898a4dbaa8fddd28fa0e477ac28bcac49dec12a5ac67",
"blk.14.ffn_gate.weight": "6d0d6a74bfe9692e8f8eedff0fc34fc4fa1c8687794f35f2e2b033ab2d7510b8",
"blk.14.ffn_norm.weight": "f7036c1a9a71e046c9d2af16e9218fda5dbb0f7241ab44747abed1f0f9d602ca",
"blk.14.ffn_up.weight": "7d69ea1424007ffc9c12247dd0308c616e93ac02a59ec341cfa48f92d6ce3b10",
"blk.14.post_attention_norm.weight": "65b9712834d9445d4236bec362f3fb795c20d60c541b3dc6dbb7914d9b493e41",
"blk.14.post_ffw_norm.weight": "9c6a8da2e4e437d5cfdf3b9097e9f8b64bf07946a048badec20f4d374613f38f",
"blk.15.attn_k.weight": "864bc618303a0e4ee67fb1d5e751de61e936cd51e96669dd86f8cd08f2305045",
"blk.15.attn_norm.weight": "f9f4187da6eeadc2fc5921d8fe669741697d16c13d71e4aaeb73b82f50dc577e",
"blk.15.attn_output.weight": "ce2419a0b097036b2a31f2f4ad731d5814bcc2ef4c511786e24471e5eefd273b",
"blk.15.attn_q.weight": "9539db5a970d11ebe99722d1e13fcd635e250033630811efe583d2f97778e4a9",
"blk.15.attn_v.weight": "1c834b48ccd88adaeabb7d8bcb6be0bcd6d5ac1354ce88fc28f19a1a96b81ab3",
"blk.15.ffn_down.weight": "bc1f97a65dde6fa2c1e5397afb612266944b343f2eaa868b635ddd25829f8a42",
"blk.15.ffn_gate.weight": "1b14529d57056b79037f6cb5008132e62cc35992353b38dda59572274623103b",
"blk.15.ffn_norm.weight": "9af77458de9ee55c66f93865759f9c2c398557f94f3fa8fa6af30543d7339cde",
"blk.15.ffn_up.weight": "41d524a26b61a9595816b4fd53cf57ef50a702e4ef32933ff6136dca9136a267",
"blk.15.post_attention_norm.weight": "c60a03cd0e63a7db5c80015e58e9b97ba2208caa19f66a6fef5c4447eca900ce",
"blk.15.post_ffw_norm.weight": "34f7f9f96769215bbc3d17084df091864aef96a6645b7d0b3b7d9bd92f1a4b0b",
"blk.16.attn_k.weight": "7e27240d9f3a8c6cf0f4a980113d43234f514eadc3e3e1792b86efb29ffb1a6d",
"blk.16.attn_norm.weight": "af798acc0899282a30448edec48223b3e8efda177090273e612d8eca5e377301",
"blk.16.attn_output.weight": "79df39a3709d3d53e84146291e0944a7a653d06705293d9ccb5648dceadb432c",
"blk.16.attn_q.weight": "db58a1c3b83ad294804e5fd7321005719e200659173466df5a52a182b80b7165",
"blk.16.attn_v.weight": "2af6d48cbaeb225b5c1a704f76abd89c8ab1521417695b112b4dcc2cbd39b74d",
"blk.16.ffn_down.weight": "fc1c813eb5e7da3d6194569d6cb21602fc6eff2dc8e1b0eb753f2d5df148189c",
"blk.16.ffn_gate.weight": "7a80bcbc42464bd55df4814a6edbd7b5c153e0428323bbe49de55e2d2add33e7",
"blk.16.ffn_norm.weight": "2041685ee926d30f3f2ae4ec35b5688f1cd834167a6359a7d4057eac804c58b2",
"blk.16.ffn_up.weight": "8da4b718973ac1d43b928829bc45e062fd101984d6c98dd825bd7c5d08ebfbe3",
"blk.16.post_attention_norm.weight": "975c48fe680a6167438a106140a8872eee7765191f152d80e3b8ddf47693e095",
"blk.16.post_ffw_norm.weight": "4de2d4d483acfe4fc77860ea929025df2f4e15c10729413f36a18c94eaa6d689",
"blk.17.attn_k.weight": "f937e61f0af8c4cd98ee742648eb60e02e579683e21d421071295a3b70aebaad",
"blk.17.attn_norm.weight": "c3270583ed28b7e423f5b170c59113234f258169b93a867d9274f4c10b7cb115",
"blk.17.attn_output.weight": "b8c1150e81e685e539a5dcf2c19047a24eba2b281fabe166674b1d71ef4612ea",
"blk.17.attn_q.weight": "c255100ae2011e7dc7e3bf3bc3ccd96d859fbb98581cae993d7b82c1ba8e8b39",
"blk.17.attn_v.weight": "5830bb0a555984c6485348067f70b5d22ae337c011aa9248dac2ff4c95944551",
"blk.17.ffn_down.weight": "8ff9a7cccaa3776434a9d895aae4fb5c36c736bf2ec98784226b4c234940fbb0",
"blk.17.ffn_gate.weight": "1b52876739712831c272911533da206f407b46034a1a4ae8a88c1f96b6bd5747",
"blk.17.ffn_norm.weight": "d0e16ba5e87c91b545334e022058c7d03849665c3b1a6298771b656531366b66",
"blk.17.ffn_up.weight": "4dd6211d01dbebbe21052708eddc242b082a58b5f18ed16479e17987c1d3432e",
"blk.17.post_attention_norm.weight": "6f49c775c7417dade77ba8268a0f8441c1e5ec28b5d7e4dc5ed07a04d04600c8",
"blk.17.post_ffw_norm.weight": "b91a0bb2e6679e9c9be06ad323adae441d00a3d673efb19d7c4954be2aa84b27",
"blk.18.attn_k.weight": "22b565ace1b4da8b33865a58625be1d90beea9891f29686a69fa9cf7c93217db",
"blk.18.attn_norm.weight": "3e0160d7063c8753de65d2356a66648e47d921efdc5c917efb8209892120f8db",
"blk.18.attn_output.weight": "e3180f0bb4ca90b31e9b08158db38e332de62dfbaefe34aa94cc316409331e09",
"blk.18.attn_q.weight": "f3a5a83614c3ba7ea41cdd5b1b0819a241ee2a951a381ce4a9e001d3f700ed8f",
"blk.18.attn_v.weight": "f3350a5984fb951fc738adcf78147e6d812ff1c576670c460cafc99c253c1654",
"blk.18.ffn_down.weight": "9e9d09b13a33525e14bdaee6efc65c551ac7cf7680e534b940ab122a3a7c1ac9",
"blk.18.ffn_gate.weight": "ebaec8b4b578a2e8d815baac12f1675c208f80c68074d5a18288a2e1a60680ee",
"blk.18.ffn_norm.weight": "33e7687c53a242f2f8dc7093a491c97b18d4a5a8c14d183f02bd586a770f05aa",
"blk.18.ffn_up.weight": "78a1816662378ce56cc870e705174492781897b3afd2d4d97a51f10f2f2987c1",
"blk.18.post_attention_norm.weight": "a58dde3f12df3e94cbc27d87c8ea86f89af8a388a506446ff6758f05399b05fc",
"blk.18.post_ffw_norm.weight": "cebf90cc143577d483cca27b032dfd82031ee59bdf17c0e2cf60a0a3ad5bf996",
"blk.19.attn_k.weight": "4683375d0599ac9e2232196aae1e90af13a14cae26e865465de5c8e257bb2055",
"blk.19.attn_norm.weight": "f3eba936bfb1814bbcb0a1d62739eb66daac839df8c9c836fe0e94860df88525",
"blk.19.attn_output.weight": "51c0f01d38a9dcfe9bdbc4643576fab164c1d9e4b7168b7695c0ee55e6965667",
"blk.19.attn_q.weight": "28d15b69b8416f2e7ddc88fe381cb1e2ef2ad705fb1c268139ba96498cc74848",
"blk.19.attn_v.weight": "6860f1cd720638e63a981fa2c0b4db900129826bcb9823c9ddf9fb8b1b9f3383",
"blk.19.ffn_down.weight": "bc7f2d7827ee01c2dd41401c7b3b1700ad3a4ff620e8bb734f92630d342dcc7f",
"blk.19.ffn_gate.weight": "54d03ef69ba373fc410fbca8f1e34a565d58e4296d9a035ff7e48340b9c848e7",
"blk.19.ffn_norm.weight": "9178fc796a340ee6e8128ca74c0cb6203d1adbed6927af4e5ac7863da57affc7",
"blk.19.ffn_up.weight": "a77bd708026c6e83ad5c79c223278e74621bcf74a9641c7818d96b595daaad20",
"blk.19.post_attention_norm.weight": "ae94aa26f4c411bf9496a6fd4a6df64ee589ee1ae9a04b531d45acc95721e582",
"blk.19.post_ffw_norm.weight": "9ad210700edeef12133bdcff04bf1c7f62b49f6f4a9ba483c7cdc59857c24a5c",
"blk.20.attn_k.weight": "e35bce1e9f4a7a09ef34721f57ea38cfca68c272f52d923fe50af8308f66cfaa",
"blk.20.attn_norm.weight": "644800f6926fd34f233795c4dec1151a295d2138ca8cac33e3e48167d26f8b41",
"blk.20.attn_output.weight": "8d3758cd236471741e1ad66c0710cb79077dc8c7a3a292d35bc551c0c5abe627",
"blk.20.attn_q.weight": "c333b1f0f6f956b5d73891df10b1a0321e55fc31c40d623a24e1f52caa6a998b",
"blk.20.attn_v.weight": "8562b418d0c4868a050fb19fa3fcaf50a8cf1c669f537d666c80c7b3a04714e1",
"blk.20.ffn_down.weight": "97efb608ac44cc804198faec3ee66eafe56ced6b7ca5359700c6f1df75b7205e",
"blk.20.ffn_gate.weight": "5c61151d86f28415c73c73d90ec088c646cbe5c1640197caf58eb501ba7db293",
"blk.20.ffn_norm.weight": "24bbe0a701afd4bbeea65b3edde712b3cbb2281043bbc43dbf250582453116ed",
"blk.20.ffn_up.weight": "e170cf68e249566aa99eb6f6b265679bf9a5a6b76830ba24e7e130c2515910c4",
"blk.20.post_attention_norm.weight": "e092d751cfe20dbf2d348358f3b38397bd83e4ed94d6bbaa6bbaddcd902b2ac4",
"blk.20.post_ffw_norm.weight": "219a18a47dcba76e669e4322223a5a9227bd3db1de3fbd3d3cfb22e54a783c5a",
"blk.21.attn_k.weight": "c3a095ebddb42c63824f1c98da65263dc88e4d790a26aa1632840b44f5cc7cb1",
"blk.21.attn_norm.weight": "ef8bbaded5fbc45ad9cf3985ae02174524e7090fe6362811124f942ef643bec7",
"blk.21.attn_output.weight": "668f018aba72baac6252aa3ad58569ddd55ab751a0dd8d7bcc9fb9b6efb4bf53",
"blk.21.attn_q.weight": "e759c65663089f3bbbd51847934c185e680c82f1249065d5d487da638e519e6d",
"blk.21.attn_v.weight": "2ff57762686cf9ba1f5a6be76503454b97556ce67f4ac98254bd0562231197ba",
"blk.21.ffn_down.weight": "3fd106556fb721b1c28ae3f4026bc83eb1b08ed910f2ba5f466c6b5f327d91cb",
"blk.21.ffn_gate.weight": "338022d882f4b6619e8054a6fb909696fa3eef3013cf69b65c3cacdfc5b9e42c",
"blk.21.ffn_norm.weight": "1e77660c23a3f9653ee721a863d1960f773d87437cabc4dc0a6e17ee3d4e5e44",
"blk.21.ffn_up.weight": "7d31b20fbc2e6eba8f350f170069dc36f0cb12f68fbc4206ec5022a74085ebcb",
"blk.21.post_attention_norm.weight": "9638bae8d8bdcd7ed68da282979cd84a07c41ff9cabcaea94ebc846a1803db23",
"blk.21.post_ffw_norm.weight": "d622ef11115fe0cbe04b727d5a3b6371e7f39bf08c8d5eb9bc6da52e3f3cfb9d",
"blk.22.attn_k.weight": "5c321cb29deffbe57de200dd206a62005f1e80acb86c4fd2349dd44c8d3594fd",
"blk.22.attn_norm.weight": "198d949705d7170a331d75889d8c7500c3635254dac2cc6aa4dc35d556584536",
"blk.22.attn_output.weight": "19805cd5d7025b457e5d41d70db8b3fd63c2dd0e4a94d3ef1704d50ef4e749e8",
"blk.22.attn_q.weight": "177836cd583fc87405975ddc21ebfebdaa090a0363799664c72caa3da851ae2c",
"blk.22.attn_v.weight": "fea255692483e30d0108f9e4e250eb3ed7dbda8d83f499b06519b8c223ae6096",
"blk.22.ffn_down.weight": "00cb8939f03e5817d6d412de8cf2c923c9568d5493e382cec7faf5718fb034eb",
"blk.22.ffn_gate.weight": "b0591065b91281b2fbd8a9567f3568d40479f680e1f0a29e27ae213f37642489",
"blk.22.ffn_norm.weight": "96b5c5d0737c2ceb8fc869f54adb9e5f46e28cb7b177c40f49fa926b923c00f8",
"blk.22.ffn_up.weight": "81f472185b24344ab0594ea8246cc6e200e0dc1cab4943e74fbe4ca19d5a9701",
"blk.22.post_attention_norm.weight": "27fa9aa6260aa3071e0391e1a1d49322dcb6e8072315b8a9b7064087108dbd06",
"blk.22.post_ffw_norm.weight": "f37e1dcd7f643d9545675ffe9dc527a11eba86eb204989c2f44f636b266d896a",
"blk.23.attn_k.weight": "5d82f36658a56c3f94d0bb2d61f65509c966fa6568f81812e0d3e338b380ef8c",
"blk.23.attn_norm.weight": "b7983f88d9cad88bc88a528923e6da592ad20e699965b223ebc10840fe1f4fec",
"blk.23.attn_output.weight": "59f97f80f430d71606aab0158a195aed29ccd3405e6c0a5c41c809be8eb01898",
"blk.23.attn_q.weight": "53ac4789fe958919cc02ea4222bcd64c0ea1b4baa54304bff46635bdf42f7490",
"blk.23.attn_v.weight": "ec8abe09b9e84dbb52c7a068094657c6d3c62fe551ba8d7c3a3f23da622e9756",
"blk.23.ffn_down.weight": "3cf547eccb1b82aa64f208cee9682d7f558ca84e0aead7d9d3d1420d90f3d992",
"blk.23.ffn_gate.weight": "366aa2486d911ba81eb519119e13807deacf7e9908bc1975a2a63e00d6b10124",
"blk.23.ffn_norm.weight": "6d1d4a4af34bb7dc090ac87d6457d398c3e0fb68bd2e2b60b099dc318b6cfac3",
"blk.23.ffn_up.weight": "53f76692e253f5d2420b3f200c731b9f3b7a83e379920b4a067c729b4674aa4d",
"blk.23.post_attention_norm.weight": "7c952fa0efa76b3f048c8c4c9e8dcb5e3724d231327eda6423a34d3f3d3367de",
"blk.23.post_ffw_norm.weight": "7ab188cfe61f0a91b40309a0ab6bfa99f19d0ff2a37b6ac10e5f0c7f44eb5270",
"blk.24.attn_k.weight": "225798792f9bfdd10eff0505ebe61e0aad0209c17b431f6044ee7968ffe8c198",
"blk.24.attn_norm.weight": "635e3c1ebf5219bbebfc40ef164bc32d2b726ef595a94da64ac524ae878e2915",
"blk.24.attn_output.weight": "482f5bb2db8d9ed22b253d9a3296333b239efe698e5992e5d77e7e12dc2a5cf5",
"blk.24.attn_q.weight": "43805bbccddb65d58fffc4be9b5c374d4e1df1395ec1e1ffb4bcff03e98d5adb",
"blk.24.attn_v.weight": "fa741af54b4a3b1775d32f59134756090c5df2e7345a12a2d8db94fe289667a7",
"blk.24.ffn_down.weight": "83c6351e3162626b276f524a57836144625c2556dbe321b57cbd8fd486a68fab",
"blk.24.ffn_gate.weight": "fbe66be0d84d12cea5176cc7eaef64382ffc7324cd9d6266a3342dc43442f2ac",
"blk.24.ffn_norm.weight": "77c1445a8639ad24938bdf0280233eea2362d47391421833dfa72ec756dfc1e8",
"blk.24.ffn_up.weight": "78235ac729ee23c1cf1ae543751e3af32776d8808cee6e529c2a625a1f027654",
"blk.24.post_attention_norm.weight": "161f71b6d07628d43e4ae51a4c9088ec6ca2db123a17986a14505d83fdd04dad",
"blk.24.post_ffw_norm.weight": "cf1ba692aa683368b02ac413e69b2521b98c69a5274eacbb54165b53bf38a8b2",
"blk.25.attn_k.weight": "057a56bd8c8d2b41608d1f71faa3052902152ddf85e47669ad950c1c3e77c33f",
"blk.25.attn_norm.weight": "b7179fe02c334da556ddcf6c1b502245639a728c4cbba8b552d8e1df4565ee9d",
"blk.25.attn_output.weight": "4fed8b05b08a0ff75ffd022701bbeb52f17b23d09332a1ddcba737244bd0d3b0",
"blk.25.attn_q.weight": "c52e99f5d38bf7538d6106a0bbf38ac6dc6296bca9a3f849afa384ea67b4af01",
"blk.25.attn_v.weight": "c49c23d8e1cfa6a8eb971eb69942204890c6d7d830dc8774c84b108a80598912",
"blk.25.ffn_down.weight": "c08d4dc8412b19fdc870c164b83c341b236ec6fe7bb4a9bcfe0dc100faa20286",
"blk.25.ffn_gate.weight": "1a4cb3f36735d59181721471452807903006539e5e1b5ceb4f72d1d7ae134127",
"blk.25.ffn_norm.weight": "8fd6bd0dcec5198761525a36992a57c9ec5e9da60a22092839a84ae8c4e87f26",
"blk.25.ffn_up.weight": "3a00f39bdd5f31dc5e3b281d2002e1ac4f2475d49a0ac1d7720a25b377dcd04a",
"blk.25.post_attention_norm.weight": "e5f31a648612c859b6d21c9ee426e87a86cb1973dfdd86276c767371d9cef5ad",
"blk.25.post_ffw_norm.weight": "553c3bd774922c99c2384380a142d019881d30dbf0fe3bf9430dabfb3f6cbd33",
"output_norm.weight": "49445c4585ab0a8135717a0bdb1cda4a062a030177d0119561d91542aec5744b"
}

View File

@@ -100,21 +100,8 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error)
}
if template, ok := p["chat_template"]; ok {
var s []struct {
Name string `json:"name"`
Template string `json:"template"`
}
if err := json.Unmarshal(template, &t.Template); err == nil {
// noop
} else if err := json.Unmarshal(template, &s); err == nil {
for _, e := range s {
if e.Name == "default" {
t.Template = e.Template
break
}
}
} else {
return nil, fmt.Errorf("invalid chat_template: %w", err)
if err := json.Unmarshal(template, &t.Template); err != nil {
return nil, err
}
}
@@ -154,6 +141,7 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error)
}
type tokenizer struct {
Version string `json:"version"`
AddedTokens []token `json:"added_tokens"`
Model struct {
Type string `json:"type"`
@@ -251,7 +239,7 @@ func parseVocabulary(fsys fs.FS) (*Vocabulary, error) {
return pattern.Func(fsys)
}
return nil, errors.New("unknown tokenizer format")
return nil, errors.New("unknown tensor format")
}
type SpecialVocabulary struct {

View File

@@ -1,208 +0,0 @@
package convert
import (
"io"
"io/fs"
"os"
"path/filepath"
"strings"
"testing"
"github.com/google/go-cmp/cmp"
)
func createTokenizerFS(t *testing.T, dir string, files map[string]io.Reader) fs.FS {
t.Helper()
for k, v := range files {
if err := func() error {
f, err := os.Create(filepath.Join(dir, k))
if err != nil {
return err
}
defer f.Close()
if _, err := io.Copy(f, v); err != nil {
return err
}
return nil
}(); err != nil {
t.Fatalf("unexpected error: %v", err)
}
}
return os.DirFS(dir)
}
// TestParseTokenizer is a table-driven test for parseTokenizer. Each case
// builds a temp directory holding tokenizer.json (and, where needed,
// tokenizer_config.json) via createTokenizerFS, parses it, and compares the
// result against want with cmp.Diff.
func TestParseTokenizer(t *testing.T) {
cases := []struct {
name string
fsys fs.FS
specialTokenTypes []string
want *Tokenizer
}{
// chat_template supplied as a plain JSON string: expected verbatim in Template.
// With an empty tokenizer.json the expected Model is "gpt2" and Pre is "default".
{
name: "string chat template",
fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
"tokenizer.json": strings.NewReader(`{}`),
"tokenizer_config.json": strings.NewReader(`{
"chat_template": "<default template>"
}`),
}),
want: &Tokenizer{
Vocabulary: &Vocabulary{Model: "gpt2"},
Pre: "default",
Template: "<default template>",
},
},
// chat_template supplied as a list of {name, template} objects: the entry
// named "default" is the one expected in Template; "tools" is not used.
{
name: "list chat template",
fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
"tokenizer.json": strings.NewReader(`{}`),
"tokenizer_config.json": strings.NewReader(`{
"chat_template": [
{
"name": "default",
"template": "<default template>"
},
{
"name": "tools",
"template": "<tools template>"
}
]
}`),
}),
want: &Tokenizer{
Vocabulary: &Vocabulary{Model: "gpt2"},
Pre: "default",
Template: "<default template>",
},
},
// A single non-special added token and no model vocab: the token is expected
// in the vocabulary with a score equal to its id (999) and type 4.
{
name: "added tokens",
fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
"tokenizer.json": strings.NewReader(`{
"added_tokens": [
{
"id": 999,
"content": "<unused999>",
"special": false
}
]
}`),
}),
want: &Tokenizer{
Vocabulary: &Vocabulary{
Model: "gpt2",
Tokens: []string{"<unused999>"},
Scores: []float32{999},
Types: []int32{4},
},
Pre: "default",
},
},
// The same token appears in both added_tokens and model.vocab: it is expected
// exactly once in the output, with type 3 because it is marked special.
{
name: "added tokens overlap vocab",
fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
"tokenizer.json": strings.NewReader(`{
"added_tokens": [
{
"id": 0,
"content": "<pad>",
"special": true
}
],
"model": {
"vocab": {
"<pad>": 0
}
}
}`),
}),
want: &Tokenizer{
Vocabulary: &Vocabulary{
Model: "gpt2",
Tokens: []string{"<pad>"},
Scores: []float32{0},
Types: []int32{3},
},
Pre: "default",
},
},
// Four special tokens plus a tokenizer_config.json naming them: one
// SpecialVocabulary entry is expected per requested type, in request order.
// AddToken is expected true only for bos, mirroring add_bos_token: true and
// add_eos_token: false; pad and unk have no add_* key in the config.
{
name: "special token types",
fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
"tokenizer.json": strings.NewReader(`{
"added_tokens": [
{
"id": 0,
"content": "<pad>",
"special": true
},
{
"id": 1,
"content": "<eos>",
"special": true
},
{
"id": 2,
"content": "<bos>",
"special": true
},
{
"id": 3,
"content": "<unk>",
"special": true
}
],
"model": {
"vocab": {
"<pad>": 0,
"<eos>": 1,
"<bos>": 2,
"<unk>": 3
}
}
}`),
"tokenizer_config.json": strings.NewReader(`{
"add_bos_token": true,
"add_eos_token": false,
"bos_token": "<bos>",
"eos_token": "<eos>",
"pad_token": "<pad>",
"unk_token": "<unk>"
}`),
}),
specialTokenTypes: []string{"pad", "eos", "bos", "unk"},
want: &Tokenizer{
Vocabulary: &Vocabulary{
Model: "gpt2",
Tokens: []string{"<pad>", "<eos>", "<bos>", "<unk>"},
Scores: []float32{0, 1, 2, 3},
Types: []int32{3, 3, 3, 3},
},
SpecialVocabulary: []*SpecialVocabulary{
{Type: "pad", Content: "<pad>", ID: 0, AddToken: false},
{Type: "eos", Content: "<eos>", ID: 1, AddToken: false},
{Type: "bos", Content: "<bos>", ID: 2, AddToken: true},
{Type: "unk", Content: "<unk>", ID: 3, AddToken: false},
},
Pre: "default",
},
},
}
// Run every case as its own subtest so failures are reported by case name.
for _, tt := range cases {
t.Run(tt.name, func(t *testing.T) {
tokenizer, err := parseTokenizer(tt.fsys, tt.specialTokenTypes)
// No case in the table expects a parse error, so any error is fatal.
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
// An empty diff means the parsed tokenizer matches want exactly.
if diff := cmp.Diff(tt.want, tokenizer); diff != "" {
t.Errorf("unexpected tokenizer (-want +got):\n%s", diff)
}
})
}
}

View File

@@ -69,7 +69,7 @@ Enable JSON mode by setting the `format` parameter to `json`. This will structur
```shell
curl http://localhost:11434/api/generate -d '{
"model": "llama3.2",
"model": "llama3",
"prompt": "Why is the sky blue?"
}'
```
@@ -80,7 +80,7 @@ A stream of JSON objects is returned:
```json
{
"model": "llama3.2",
"model": "llama3",
"created_at": "2023-08-04T08:52:19.385406455-07:00",
"response": "The",
"done": false
@@ -102,7 +102,7 @@ To calculate how fast the response is generated in tokens per second (token/s),
```json
{
"model": "llama3.2",
"model": "llama3",
"created_at": "2023-08-04T19:22:45.499127Z",
"response": "",
"done": true,
@@ -124,7 +124,7 @@ A response can be received in one reply when streaming is off.
```shell
curl http://localhost:11434/api/generate -d '{
"model": "llama3.2",
"model": "llama3",
"prompt": "Why is the sky blue?",
"stream": false
}'
@@ -136,7 +136,7 @@ If `stream` is set to `false`, the response will be a single JSON object:
```json
{
"model": "llama3.2",
"model": "llama3",
"created_at": "2023-08-04T19:22:45.499127Z",
"response": "The sky is blue because it is the color of the sky.",
"done": true,
@@ -194,7 +194,7 @@ curl http://localhost:11434/api/generate -d '{
```shell
curl http://localhost:11434/api/generate -d '{
"model": "llama3.2",
"model": "llama3",
"prompt": "What color is the sky at different times of the day? Respond using JSON",
"format": "json",
"stream": false
@@ -205,7 +205,7 @@ curl http://localhost:11434/api/generate -d '{
```json
{
"model": "llama3.2",
"model": "llama3",
"created_at": "2023-11-09T21:07:55.186497Z",
"response": "{\n\"morning\": {\n\"color\": \"blue\"\n},\n\"noon\": {\n\"color\": \"blue-gray\"\n},\n\"afternoon\": {\n\"color\": \"warm gray\"\n},\n\"evening\": {\n\"color\": \"orange\"\n}\n}\n",
"done": true,
@@ -327,7 +327,7 @@ If you want to set custom options for the model at runtime rather than in the Mo
```shell
curl http://localhost:11434/api/generate -d '{
"model": "llama3.2",
"model": "llama3",
"prompt": "Why is the sky blue?",
"stream": false,
"options": {
@@ -368,7 +368,7 @@ curl http://localhost:11434/api/generate -d '{
```json
{
"model": "llama3.2",
"model": "llama3",
"created_at": "2023-08-04T19:22:45.499127Z",
"response": "The sky is blue because it is the color of the sky.",
"done": true,
@@ -390,7 +390,7 @@ If an empty prompt is provided, the model will be loaded into memory.
```shell
curl http://localhost:11434/api/generate -d '{
"model": "llama3.2"
"model": "llama3"
}'
```
@@ -400,40 +400,13 @@ A single JSON object is returned:
```json
{
"model": "llama3.2",
"model": "llama3",
"created_at": "2023-12-18T19:52:07.071755Z",
"response": "",
"done": true
}
```
#### Unload a model
If an empty prompt is provided and the `keep_alive` parameter is set to `0`, a model will be unloaded from memory.
##### Request
```shell
curl http://localhost:11434/api/generate -d '{
"model": "llama3.2",
"keep_alive": 0
}'
```
##### Response
A single JSON object is returned:
```json
{
"model": "llama3.2",
"created_at": "2024-09-12T03:54:03.516566Z",
"response": "",
"done": true,
"done_reason": "unload"
}
```
## Generate a chat completion
```shell
@@ -472,7 +445,7 @@ Send a chat message with a streaming response.
```shell
curl http://localhost:11434/api/chat -d '{
"model": "llama3.2",
"model": "llama3",
"messages": [
{
"role": "user",
@@ -488,7 +461,7 @@ A stream of JSON objects is returned:
```json
{
"model": "llama3.2",
"model": "llama3",
"created_at": "2023-08-04T08:52:19.385406455-07:00",
"message": {
"role": "assistant",
@@ -503,7 +476,7 @@ Final response:
```json
{
"model": "llama3.2",
"model": "llama3",
"created_at": "2023-08-04T19:22:45.499127Z",
"done": true,
"total_duration": 4883583458,
@@ -521,7 +494,7 @@ Final response:
```shell
curl http://localhost:11434/api/chat -d '{
"model": "llama3.2",
"model": "llama3",
"messages": [
{
"role": "user",
@@ -536,7 +509,7 @@ curl http://localhost:11434/api/chat -d '{
```json
{
"model": "llama3.2",
"model": "registry.ollama.ai/library/llama3:latest",
"created_at": "2023-12-12T14:13:43.416799Z",
"message": {
"role": "assistant",
@@ -560,7 +533,7 @@ Send a chat message with a conversation history. You can use this same approach
```shell
curl http://localhost:11434/api/chat -d '{
"model": "llama3.2",
"model": "llama3",
"messages": [
{
"role": "user",
@@ -584,7 +557,7 @@ A stream of JSON objects is returned:
```json
{
"model": "llama3.2",
"model": "llama3",
"created_at": "2023-08-04T08:52:19.385406455-07:00",
"message": {
"role": "assistant",
@@ -598,7 +571,7 @@ Final response:
```json
{
"model": "llama3.2",
"model": "llama3",
"created_at": "2023-08-04T19:22:45.499127Z",
"done": true,
"total_duration": 8113331500,
@@ -656,7 +629,7 @@ curl http://localhost:11434/api/chat -d '{
```shell
curl http://localhost:11434/api/chat -d '{
"model": "llama3.2",
"model": "llama3",
"messages": [
{
"role": "user",
@@ -674,7 +647,7 @@ curl http://localhost:11434/api/chat -d '{
```json
{
"model": "llama3.2",
"model": "registry.ollama.ai/library/llama3:latest",
"created_at": "2023-12-12T14:13:43.416799Z",
"message": {
"role": "assistant",
@@ -696,7 +669,7 @@ curl http://localhost:11434/api/chat -d '{
```
curl http://localhost:11434/api/chat -d '{
"model": "llama3.2",
"model": "llama3.1",
"messages": [
{
"role": "user",
@@ -735,7 +708,7 @@ curl http://localhost:11434/api/chat -d '{
```json
{
"model": "llama3.2",
"model": "llama3.1",
"created_at": "2024-07-22T20:33:28.123648Z",
"message": {
"role": "assistant",
@@ -763,64 +736,6 @@ curl http://localhost:11434/api/chat -d '{
}
```
#### Load a model
If the messages array is empty, the model will be loaded into memory.
##### Request
```
curl http://localhost:11434/api/chat -d '{
"model": "llama3.2",
"messages": []
}'
```
##### Response
```json
{
"model": "llama3.2",
"created_at":"2024-09-12T21:17:29.110811Z",
"message": {
"role": "assistant",
"content": ""
},
"done_reason": "load",
"done": true
}
```
#### Unload a model
If the messages array is empty and the `keep_alive` parameter is set to `0`, a model will be unloaded from memory.
##### Request
```
curl http://localhost:11434/api/chat -d '{
"model": "llama3.2",
"messages": [],
"keep_alive": 0
}'
```
##### Response
A single JSON object is returned:
```json
{
"model": "llama3.2",
"created_at":"2024-09-12T21:33:17.547535Z",
"message": {
"role": "assistant",
"content": ""
},
"done_reason": "unload",
"done": true
}
```
## Create a Model
```shell
@@ -989,7 +904,7 @@ Show information about a model including details, modelfile, template, parameter
```shell
curl http://localhost:11434/api/show -d '{
"name": "llama3.2"
"name": "llama3"
}'
```
@@ -1050,7 +965,7 @@ Copy a model. Creates a model with another name from an existing model.
```shell
curl http://localhost:11434/api/copy -d '{
"source": "llama3.2",
"source": "llama3",
"destination": "llama3-backup"
}'
```
@@ -1105,7 +1020,7 @@ Download a model from the ollama library. Cancelled pulls are resumed from where
```shell
curl http://localhost:11434/api/pull -d '{
"name": "llama3.2"
"name": "llama3"
}'
```

View File

@@ -148,22 +148,3 @@ In addition to the common Windows development tools described above, install AMD
- [Strawberry Perl](https://strawberryperl.com/)
Lastly, add `ninja.exe` included with MSVC to the system path (e.g. `C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\Common7\IDE\CommonExtensions\Microsoft\CMake\Ninja`).
#### Windows arm64
The default `Developer PowerShell for VS 2022` may default to x86 which is not what you want. To ensure you get an arm64 development environment, start a plain PowerShell terminal and run:
```powershell
import-module 'C:\\Program Files\\Microsoft Visual Studio\\2022\\Community\\Common7\\Tools\\Microsoft.VisualStudio.DevShell.dll'
Enter-VsDevShell -Arch arm64 -vsinstallpath 'C:\\Program Files\\Microsoft Visual Studio\\2022\\Community' -skipautomaticlocation
```
You can confirm with `write-host $env:VSCMD_ARG_TGT_ARCH`
Follow the instructions at https://www.msys2.org/wiki/arm64/ to set up an arm64 msys2 environment. Ollama requires gcc and mingw32-make to compile, which are not currently available on Windows arm64, but a gcc compatibility adapter is available via `mingw-w64-clang-aarch64-gcc-compat`. At a minimum you will need to install the following:
```
pacman -S mingw-w64-clang-aarch64-clang mingw-w64-clang-aarch64-gcc-compat mingw-w64-clang-aarch64-make make
```
You will need to ensure your PATH includes go, cmake, gcc, clang, and mingw32-make to build ollama from source (typically `C:\msys64\clangarm64\bin\`).

View File

@@ -63,7 +63,7 @@ docker run -d --device /dev/kfd --device /dev/dri -v ollama:/root/.ollama -p 114
Now you can run a model:
```
docker exec -it ollama ollama run llama3.2
docker exec -it ollama ollama run llama3.1
```
### Try different models

View File

@@ -32,7 +32,7 @@ When using the API, specify the `num_ctx` parameter:
```shell
curl http://localhost:11434/api/generate -d '{
"model": "llama3.2",
"model": "llama3",
"prompt": "Why is the sky blue?",
"options": {
"num_ctx": 4096
@@ -194,8 +194,6 @@ Refer to the section [above](#how-do-i-configure-ollama-server) for how to set e
If a different directory needs to be used, set the environment variable `OLLAMA_MODELS` to the chosen directory.
> Note: on Linux using the standard installer, the `ollama` user needs read and write access to the specified directory. To assign the directory to the `ollama` user run `sudo chown -R ollama:ollama <directory>`.
Refer to the section [above](#how-do-i-configure-ollama-server) for how to set environment variables on your platform.
## How can I use Ollama in Visual Studio Code?
@@ -232,18 +230,14 @@ curl http://localhost:11434/api/chat -d '{"model": "mistral"}'
To preload a model using the CLI, use the command:
```shell
ollama run llama3.2 ""
ollama run llama3.1 ""
```
## How do I keep a model loaded in memory or make it unload immediately?
By default models are kept in memory for 5 minutes before being unloaded. This allows for quicker response times if you're making numerous requests to the LLM. If you want to immediately unload a model from memory, use the `ollama stop` command:
By default models are kept in memory for 5 minutes before being unloaded. This allows for quicker response times if you are making numerous requests to the LLM. You may, however, want to free up the memory before the 5 minutes have elapsed or keep the model loaded indefinitely. Use the `keep_alive` parameter with either the `/api/generate` or `/api/chat` API endpoint to control how long the model is left in memory.
```shell
ollama stop llama3.2
```
If you're using the API, use the `keep_alive` parameter with the `/api/generate` and `/api/chat` endpoints to set the amount of time that a model stays in memory. The `keep_alive` parameter can be set to:
The `keep_alive` parameter can be set to:
* a duration string (such as "10m" or "24h")
* a number in seconds (such as 3600)
* any negative number which will keep the model loaded in memory (e.g. -1 or "-1m")
@@ -251,17 +245,17 @@ If you're using the API, use the `keep_alive` parameter with the `/api/generate`
For example, to preload a model and leave it in memory use:
```shell
curl http://localhost:11434/api/generate -d '{"model": "llama3.2", "keep_alive": -1}'
curl http://localhost:11434/api/generate -d '{"model": "llama3", "keep_alive": -1}'
```
To unload the model and free up memory use:
```shell
curl http://localhost:11434/api/generate -d '{"model": "llama3.2", "keep_alive": 0}'
curl http://localhost:11434/api/generate -d '{"model": "llama3", "keep_alive": 0}'
```
Alternatively, you can change the amount of time all models are loaded into memory by setting the `OLLAMA_KEEP_ALIVE` environment variable when starting the Ollama server. The `OLLAMA_KEEP_ALIVE` variable uses the same parameter types as the `keep_alive` parameter types mentioned above. Refer to the section explaining [how to configure the Ollama server](#how-do-i-configure-ollama-server) to correctly set the environment variable.
Alternatively, you can change the amount of time all models are loaded into memory by setting the `OLLAMA_KEEP_ALIVE` environment variable when starting the Ollama server. The `OLLAMA_KEEP_ALIVE` variable uses the same parameter types as the `keep_alive` parameter types mentioned above. Refer to the section explaining [how to configure the Ollama server](#how-do-i-configure-ollama-server) to correctly set the environment variable.
The `keep_alive` API parameter with the `/api/generate` and `/api/chat` API endpoints will override the `OLLAMA_KEEP_ALIVE` setting.
If you wish to override the `OLLAMA_KEEP_ALIVE` setting, use the `keep_alive` API parameter with the `/api/generate` or `/api/chat` API endpoints.
## How do I manage the maximum number of requests the Ollama server can queue?

View File

@@ -10,7 +10,7 @@ Check your compute compatibility to see if your card is supported:
| 9.0 | NVIDIA | `H100` |
| 8.9 | GeForce RTX 40xx | `RTX 4090` `RTX 4080 SUPER` `RTX 4080` `RTX 4070 Ti SUPER` `RTX 4070 Ti` `RTX 4070 SUPER` `RTX 4070` `RTX 4060 Ti` `RTX 4060` |
| | NVIDIA Professional | `L4` `L40` `RTX 6000` |
| 8.6 | GeForce RTX 30xx | `RTX 3090 Ti` `RTX 3090` `RTX 3080 Ti` `RTX 3080` `RTX 3070 Ti` `RTX 3070` `RTX 3060 Ti` `RTX 3060` `RTX 3050 Ti` `RTX 3050` |
| 8.6 | GeForce RTX 30xx | `RTX 3090 Ti` `RTX 3090` `RTX 3080 Ti` `RTX 3080` `RTX 3070 Ti` `RTX 3070` `RTX 3060 Ti` `RTX 3060` |
| | NVIDIA Professional | `A40` `RTX A6000` `RTX A5000` `RTX A4000` `RTX A3000` `RTX A2000` `A10` `A16` `A2` |
| 8.0 | NVIDIA | `A100` `A30` |
| 7.5 | GeForce GTX/RTX | `GTX 1650 Ti` `TITAN RTX` `RTX 2080 Ti` `RTX 2080` `RTX 2070` `RTX 2060` |

View File

Binary file not shown.

Before

Width:  |  Height:  |  Size: 150 KiB

After

Width:  |  Height:  |  Size: 141 KiB

View File

@@ -38,7 +38,7 @@ Ollama supports importing adapters based on several different model architecture
You can create the adapter using a fine tuning framework or tool which can output adapters in the Safetensors format, such as:
* Hugging Face [fine tuning framework](https://huggingface.co/docs/transformers/en/training)
* Hugging Face [fine tuning framework](https://huggingface.co/docs/transformers/en/training)
* [Unsloth](https://github.com/unslothai/unsloth)
* [MLX](https://github.com/ml-explore/mlx)
@@ -158,7 +158,7 @@ You can share any model you have created by pushing it to [ollama.com](https://o
First, use your browser to go to the [Ollama Sign-Up](https://ollama.com/signup) page. If you already have an account, you can skip this step.
<img src="images/signup.png" alt="Sign-Up" width="40%">
![Sign-Up](images/signup.png)
The `Username` field will be used as part of your model's name (e.g. `jmorganca/mymodel`), so make sure you are comfortable with the username that you have selected.
@@ -166,7 +166,7 @@ Now that you have created an account and are signed-in, go to the [Ollama Keys S
Follow the directions on the page to determine where your Ollama Public Key is located.
<img src="images/ollama-keys.png" alt="Ollama Keys" width="80%">
![Ollama Key](images/ollama-keys.png)
Click on the `Add Ollama Public Key` button, and copy and paste the contents of your Ollama Public Key into the text field.

View File

@@ -1,59 +1,39 @@
# Linux
# Ollama on Linux
## Install
To install Ollama, run the following command:
Install Ollama by running this one-liner:
```shell
>
```bash
curl -fsSL https://ollama.com/install.sh | sh
```
## AMD Radeon GPU support
While AMD has contributed the `amdgpu` driver upstream to the official linux
kernel source, the version is older and may not support all ROCm features. We
recommend you install the latest driver from
https://www.amd.com/en/support/linux-drivers for best support of your Radeon
GPU.
## Manual install
Download and extract the package:
### Download `ollama`
```shell
curl -L https://ollama.com/download/ollama-linux-amd64.tgz -o ollama-linux-amd64.tgz
sudo tar -C /usr -xzf ollama-linux-amd64.tgz
```
Download and extract the Linux package:
Start Ollama:
```shell
ollama serve
```
In another terminal, verify that Ollama is running:
```shell
ollama -v
```
### AMD GPU install
If you have an AMD GPU, also download and extract the additional ROCm package:
```shell
curl -L https://ollama.com/download/ollama-linux-amd64-rocm.tgz -o ollama-linux-amd64-rocm.tgz
sudo tar -C /usr -xzf ollama-linux-amd64-rocm.tgz
```
### ARM64 install
Download and extract the ARM64-specific package:
```shell
curl -L https://ollama.com/download/ollama-linux-arm64.tgz -o ollama-linux-arm64.tgz
sudo tar -C /usr -xzf ollama-linux-arm64.tgz
```bash
curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz | sudo tar zx -C /usr
```
### Adding Ollama as a startup service (recommended)
Create a user and group for Ollama:
Create a user for Ollama:
```shell
sudo useradd -r -s /bin/false -U -m -d /usr/share/ollama ollama
sudo usermod -a -G ollama $(whoami)
```bash
sudo useradd -r -s /bin/false -m -d /usr/share/ollama ollama
```
Create a service file in `/etc/systemd/system/ollama.service`:
@@ -69,7 +49,6 @@ User=ollama
Group=ollama
Restart=always
RestartSec=3
Environment="PATH=$PATH"
[Install]
WantedBy=default.target
@@ -77,54 +56,46 @@ WantedBy=default.target
Then start the service:
```shell
```bash
sudo systemctl daemon-reload
sudo systemctl enable ollama
```
### Install CUDA drivers (optional)
### Install CUDA drivers (optional for Nvidia GPUs)
[Download and install](https://developer.nvidia.com/cuda-downloads) CUDA.
Verify that the drivers are installed by running the following command, which should print details about your GPU:
```shell
```bash
nvidia-smi
```
### Install AMD ROCm drivers (optional)
### Install ROCm (optional - for Radeon GPUs)
[Download and Install](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html)
[Download and Install](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html) ROCm v6.
Make sure to install ROCm v6
### Start Ollama
Start Ollama and verify it is running:
Start Ollama using `systemd`:
```shell
```bash
sudo systemctl start ollama
sudo systemctl status ollama
```
> [!NOTE]
> While AMD has contributed the `amdgpu` driver upstream to the official linux
> kernel source, the version is older and may not support all ROCm features. We
> recommend you install the latest driver from
> https://www.amd.com/en/support/linux-drivers for best support of your Radeon
> GPU.
## Update
## Updating
Update ollama by running the install script again:
Update Ollama by running the install script again:
```shell
```bash
curl -fsSL https://ollama.com/install.sh | sh
```
Or by re-downloading Ollama:
Or by downloading the ollama binary:
```shell
curl -L https://ollama.com/download/ollama-linux-amd64.tgz -o ollama-linux-amd64.tgz
sudo tar -C /usr -xzf ollama-linux-amd64.tgz
```bash
curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz | sudo tar zx -C /usr
```
## Installing specific versions
@@ -133,15 +104,15 @@ Use `OLLAMA_VERSION` environment variable with the install script to install a s
For example:
```shell
curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION=0.3.9 sh
```
curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION=0.1.32 sh
```
## Viewing logs
To view logs of Ollama running as a startup service, run:
```shell
```bash
journalctl -e -u ollama
```
@@ -149,7 +120,7 @@ journalctl -e -u ollama
Remove the ollama service:
```shell
```bash
sudo systemctl stop ollama
sudo systemctl disable ollama
sudo rm /etc/systemd/system/ollama.service
@@ -157,13 +128,13 @@ sudo rm /etc/systemd/system/ollama.service
Remove the ollama binary from your bin directory (either `/usr/local/bin`, `/usr/bin`, or `/bin`):
```shell
```bash
sudo rm $(which ollama)
```
Remove the downloaded models and Ollama service user and group:
```shell
```bash
sudo rm -r /usr/share/ollama
sudo userdel ollama
sudo groupdel ollama

View File

@@ -11,9 +11,8 @@ A model file is the blueprint to create and share models with Ollama.
- [Examples](#examples)
- [Instructions](#instructions)
- [FROM (Required)](#from-required)
- [Build from existing model](#build-from-existing-model)
- [Build from a Safetensors model](#build-from-a-safetensors-model)
- [Build from a GGUF file](#build-from-a-gguf-file)
- [Build from llama3](#build-from-llama3)
- [Build from a bin file](#build-from-a-bin-file)
- [PARAMETER](#parameter)
- [Valid Parameters and Values](#valid-parameters-and-values)
- [TEMPLATE](#template)
@@ -50,7 +49,7 @@ INSTRUCTION arguments
An example of a `Modelfile` creating a mario blueprint:
```modelfile
FROM llama3.2
FROM llama3
# sets the temperature to 1 [higher is more creative, lower is more coherent]
PARAMETER temperature 1
# sets the context window size to 4096, this controls how many tokens the LLM can use as context to generate the next token
@@ -72,10 +71,10 @@ More examples are available in the [examples directory](../examples).
To view the Modelfile of a given model, use the `ollama show --modelfile` command.
```bash
> ollama show --modelfile llama3.2
> ollama show --modelfile llama3
# Modelfile generated by "ollama show"
# To build a new Modelfile based on this one, replace the FROM line with:
# FROM llama3.2:latest
# FROM llama3:latest
FROM /Users/pdevine/.ollama/models/blobs/sha256-00e1317cbf74d901080d7100f57580ba8dd8de57203072dc6f668324ba545f29
TEMPLATE """{{ if .System }}<|start_header_id|>system<|end_header_id|>
@@ -100,39 +99,22 @@ The `FROM` instruction defines the base model to use when creating a model.
FROM <model name>:<tag>
```
#### Build from existing model
#### Build from llama3
```modelfile
FROM llama3.2
FROM llama3
```
A list of available base models:
<https://github.com/ollama/ollama#model-library>
Additional models can be found at:
<https://ollama.com/library>
#### Build from a Safetensors model
#### Build from a `bin` file
```modelfile
FROM <model directory>
FROM ./ollama-model.bin
```
The model directory should contain the Safetensors weights for a supported architecture.
Currently supported model architectures:
* Llama (including Llama 2, Llama 3, and Llama 3.1)
* Mistral (including Mistral 1, Mistral 2, and Mixtral)
* Gemma (including Gemma 1 and Gemma 2)
* Phi3
#### Build from a GGUF file
```modelfile
FROM ./ollama-model.gguf
```
The GGUF file location should be specified as an absolute path or relative to the `Modelfile` location.
This bin file location should be specified as an absolute path or relative to the `Modelfile` location.
### PARAMETER
@@ -192,23 +174,10 @@ SYSTEM """<system message>"""
### ADAPTER
The `ADAPTER` instruction specifies a fine tuned LoRA adapter that should apply to the base model. The value of the adapter should be an absolute path or a path relative to the Modelfile. The base model should be specified with a `FROM` instruction. If the base model is not the same as the base model that the adapter was tuned from, the behaviour will be erratic.
#### Safetensor adapter
The `ADAPTER` instruction is an optional instruction that specifies any LoRA adapter that should apply to the base model. The value of this instruction should be an absolute path or a path relative to the Modelfile and the file must be in a GGML file format. The adapter should be tuned from the base model otherwise the behaviour is undefined.
```modelfile
ADAPTER <path to safetensor adapter>
```
Currently supported Safetensor adapters:
* Llama (including Llama 2, Llama 3, and Llama 3.1)
* Mistral (including Mistral 1, Mistral 2, and Mixtral)
* Gemma (including Gemma 1 and Gemma 2)
#### GGUF adapter
```modelfile
ADAPTER ./ollama-lora.gguf
ADAPTER ./ollama-lora.bin
```
### LICENSE

View File

@@ -25,7 +25,7 @@ chat_completion = client.chat.completions.create(
'content': 'Say this is a test',
}
],
model='llama3.2',
model='llama3',
)
response = client.chat.completions.create(
@@ -46,13 +46,13 @@ response = client.chat.completions.create(
)
completion = client.completions.create(
model="llama3.2",
model="llama3",
prompt="Say this is a test",
)
list_completion = client.models.list()
model = client.models.retrieve("llama3.2")
model = client.models.retrieve("llama3")
embeddings = client.embeddings.create(
model="all-minilm",
@@ -74,7 +74,7 @@ const openai = new OpenAI({
const chatCompletion = await openai.chat.completions.create({
messages: [{ role: 'user', content: 'Say this is a test' }],
model: 'llama3.2',
model: 'llama3',
})
const response = await openai.chat.completions.create({
@@ -94,13 +94,13 @@ const response = await openai.chat.completions.create({
})
const completion = await openai.completions.create({
model: "llama3.2",
model: "llama3",
prompt: "Say this is a test.",
})
const listCompletion = await openai.models.list()
const model = await openai.models.retrieve("llama3.2")
const model = await openai.models.retrieve("llama3")
const embedding = await openai.embeddings.create({
model: "all-minilm",
@@ -114,7 +114,7 @@ const embedding = await openai.embeddings.create({
curl http://localhost:11434/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "llama3.2",
"model": "llama3",
"messages": [
{
"role": "system",
@@ -154,13 +154,13 @@ curl http://localhost:11434/v1/chat/completions \
curl http://localhost:11434/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "llama3.2",
"model": "llama3",
"prompt": "Say this is a test"
}'
curl http://localhost:11434/v1/models
curl http://localhost:11434/v1/models/llama3.2
curl http://localhost:11434/v1/models/llama3
curl http://localhost:11434/v1/embeddings \
-H "Content-Type: application/json" \
@@ -274,7 +274,7 @@ curl http://localhost:11434/v1/embeddings \
Before using a model, pull it locally `ollama pull`:
```shell
ollama pull llama3.2
ollama pull llama3
```
### Default model names
@@ -282,7 +282,7 @@ ollama pull llama3.2
For tooling that relies on default OpenAI model names such as `gpt-3.5-turbo`, use `ollama cp` to copy an existing model name to a temporary name:
```
ollama cp llama3.2 gpt-3.5-turbo
ollama cp llama3 gpt-3.5-turbo
```
Afterwards, this new model name can be specified in the `model` field:
@@ -300,28 +300,3 @@ curl http://localhost:11434/v1/chat/completions \
]
}'
```
### Setting the context size
The OpenAI API does not have a way of setting the context size for a model. If you need to change the context size, create a `Modelfile` which looks like:
```modelfile
FROM <some model>
PARAMETER num_ctx <context size>
```
Use the `ollama create mymodel` command to create a new model with the updated context size. Call the API with the updated model name:
```shell
curl http://localhost:11434/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "mymodel",
"messages": [
{
"role": "user",
"content": "Hello!"
}
]
}'
```

View File

@@ -33,7 +33,7 @@ Omitting a template in these models puts the responsibility of correctly templat
To add templates in your model, you'll need to add a `TEMPLATE` command to the Modelfile. Here's an example using Meta's Llama 3.
```dockerfile
FROM llama3.2
FROM llama3
TEMPLATE """{{- if .System }}<|start_header_id|>system<|end_header_id|>

View File

@@ -91,17 +91,6 @@ If none of those resolve the problem, gather additional information and file an
- Check dmesg for any errors `sudo dmesg | grep -i nvrm` and `sudo dmesg | grep -i nvidia`
## AMD GPU Discovery
On linux, AMD GPU access typically requires `video` and/or `render` group membership to access the `/dev/kfd` device. If permissions are not set up correctly, Ollama will detect this and report an error in the server log.
When running in a container, in some Linux distributions and container runtimes, the ollama process may be unable to access the GPU. Use `ls -ld /dev/kfd /dev/dri /dev/dri/*` on the host system to determine the group assignments on your system, and pass additional `--group-add ...` arguments to the container so it can access the required devices.
If you are experiencing problems getting Ollama to correctly discover or use your GPU for inference, the following may help isolate the failure.
- `AMD_LOG_LEVEL=3` Enable info log levels in the AMD HIP/ROCm libraries. This can help show more detailed error codes that can help troubleshoot problems
- `OLLAMA_DEBUG=1` During GPU discovery additional information will be reported
- Check dmesg for any errors from amdgpu or kfd drivers `sudo dmesg | grep -i amdgpu` and `sudo dmesg | grep -i kfd`
## Windows Terminal Errors
Older versions of Windows 10 (e.g., 21H1) are known to have a bug where the standard terminal program does not display control characters correctly. This can result in a long sequence of strings like `←[?25h←[?25l` being displayed, sometimes erroring with `The parameter is incorrect`. To resolve this problem, please update to Win 10 22H1 or newer.

View File

@@ -15,7 +15,7 @@ import { Ollama } from "@langchain/community/llms/ollama";
const ollama = new Ollama({
baseUrl: "http://localhost:11434",
model: "llama3.2",
model: "llama3.1",
});
const answer = await ollama.invoke(`why is the sky blue?`);
@@ -23,7 +23,7 @@ const answer = await ollama.invoke(`why is the sky blue?`);
console.log(answer);
```
That will get us the same thing as if we ran `ollama run llama3.2 "why is the sky blue"` in the terminal. But we want to load a document from the web to ask a question against. **Cheerio** is a great library for ingesting a webpage, and **LangChain** uses it in their **CheerioWebBaseLoader**. So let's install **Cheerio** and build that part of the app.
That will get us the same thing as if we ran `ollama run llama3.1 "why is the sky blue"` in the terminal. But we want to load a document from the web to ask a question against. **Cheerio** is a great library for ingesting a webpage, and **LangChain** uses it in their **CheerioWebBaseLoader**. So let's install **Cheerio** and build that part of the app.
```bash
npm install cheerio

View File

@@ -29,7 +29,7 @@ Ollama uses unicode characters for progress indication, which may render as unkn
Here's a quick example showing API access from `powershell`
```powershell
(Invoke-WebRequest -method POST -Body '{"model":"llama3.2", "prompt":"Why is the sky blue?", "stream": false}' -uri http://localhost:11434/api/generate ).Content | ConvertFrom-json
(Invoke-WebRequest -method POST -Body '{"model":"llama3", "prompt":"Why is the sky blue?", "stream": false}' -uri http://localhost:11434/api/generate ).Content | ConvertFrom-json
```
## Troubleshooting
@@ -48,9 +48,6 @@ the explorer window by hitting `<cmd>+R` and type in:
- `explorer %HOMEPATH%\.ollama` contains models and configuration
- `explorer %TEMP%` contains temporary executable files in one or more `ollama*` directories
## Uninstall
The Ollama Windows installer registers an Uninstaller application. Under `Add or remove programs` in Windows Settings, you can uninstall Ollama.
## Standalone CLI

View File

@@ -30,7 +30,9 @@ func Host() *url.URL {
defaultPort = "443"
}
hostport, path, _ := strings.Cut(hostport, "/")
// trim trailing slashes
hostport = strings.TrimRight(hostport, "/")
host, port, err := net.SplitHostPort(hostport)
if err != nil {
host, port = "127.0.0.1", defaultPort
@@ -43,13 +45,15 @@ func Host() *url.URL {
if n, err := strconv.ParseInt(port, 10, 32); err != nil || n > 65535 || n < 0 {
slog.Warn("invalid port, using default", "port", port, "default", defaultPort)
port = defaultPort
return &url.URL{
Scheme: scheme,
Host: net.JoinHostPort(host, defaultPort),
}
}
return &url.URL{
Scheme: scheme,
Host: net.JoinHostPort(host, port),
Path: path,
}
}
@@ -112,26 +116,6 @@ func KeepAlive() (keepAlive time.Duration) {
return keepAlive
}
// LoadTimeout reports how long a model load may stall before it is abandoned.
// The value is read from the OLLAMA_LOAD_TIMEOUT environment variable, which
// may be either a Go duration string (e.g. "2m30s") or a bare integer that is
// interpreted as a number of seconds. Values that parse as neither leave the
// default of 5 minutes in place. Zero or negative values mean "no timeout" and
// are reported as the largest representable duration.
func LoadTimeout() (loadTimeout time.Duration) {
	const fallback = 5 * time.Minute

	loadTimeout = fallback
	raw := Var("OLLAMA_LOAD_TIMEOUT")
	if raw != "" {
		parsed, durErr := time.ParseDuration(raw)
		switch {
		case durErr == nil:
			// A well-formed duration string wins outright.
			loadTimeout = parsed
		default:
			// Otherwise accept a plain integer as seconds; anything else
			// silently keeps the default.
			if secs, intErr := strconv.ParseInt(raw, 10, 64); intErr == nil {
				loadTimeout = time.Duration(secs) * time.Second
			}
		}
	}

	if loadTimeout > 0 {
		return loadTimeout
	}
	// Non-positive means infinite.
	return time.Duration(math.MaxInt64)
}
func Bool(k string) func() bool {
return func() bool {
if s := Var(k); s != "" {
@@ -179,6 +163,53 @@ var (
HsaOverrideGfxVersion = String("HSA_OVERRIDE_GFX_VERSION")
)
// RunnersDir returns the directory that holds the llm runner payloads.
//
// OLLAMA_RUNNERS_DIR, when set, is returned as-is. On every OS other than
// Windows the function otherwise returns the empty string. On Windows it
// probes a fixed list of locations derived from the executable's directory,
// that directory's parent, and the current working directory, returning the
// first "<location>/lib/ollama/runners" that exists. If nothing is found (or
// the executable / working directory cannot be determined) an error is logged
// and the empty string is returned.
func RunnersDir() (p string) {
	if override := Var("OLLAMA_RUNNERS_DIR"); override != "" {
		return override
	}

	if runtime.GOOS != "windows" {
		return
	}

	// From here on, any exit with an empty result is worth reporting.
	defer func() {
		if p == "" {
			slog.Error("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama/runners'")
		}
	}()

	// On Windows we do not carry the payloads inside the main executable
	exe, err := os.Executable()
	if err != nil {
		return
	}

	cwd, err := os.Getwd()
	if err != nil {
		return
	}

	exeDir := filepath.Dir(exe)
	platform := runtime.GOOS + "-" + runtime.GOARCH

	// Try a few variations to improve developer experience when building from
	// source in the local tree. Probe order: for each root, the root itself,
	// then root/<os>-<arch>, then root/dist/<os>-<arch>.
	for _, root := range []string{exeDir, filepath.Join(exeDir, ".."), cwd} {
		bases := []string{
			root,
			filepath.Join(root, platform),
			filepath.Join(root, "dist", platform),
		}
		for _, base := range bases {
			candidate := filepath.Join(base, "lib", "ollama", "runners")
			if _, statErr := os.Stat(candidate); statErr == nil {
				p = candidate
				return p
			}
		}
	}

	return p
}
func Uint(key string, defaultValue uint) func() uint {
return func() uint {
if s := Var(key); s != "" {
@@ -204,23 +235,6 @@ var (
MaxVRAM = Uint("OLLAMA_MAX_VRAM", 0)
)
// Uint64 builds a getter for the environment variable key. Each call of the
// returned function re-reads the variable: an unset/empty value yields
// defaultValue, a valid base-10 unsigned 64-bit integer yields that number,
// and anything else logs a warning and yields defaultValue.
func Uint64(key string, defaultValue uint64) func() uint64 {
	return func() uint64 {
		raw := Var(key)
		if raw == "" {
			return defaultValue
		}

		parsed, err := strconv.ParseUint(raw, 10, 64)
		if err == nil {
			return parsed
		}

		slog.Warn("invalid environment variable, using default", "key", key, "value", raw, "default", defaultValue)
		return defaultValue
	}
}
// Set aside VRAM per GPU
var GpuOverhead = Uint64("OLLAMA_GPU_OVERHEAD", 0)
type EnvVar struct {
Name string
Value any
@@ -231,11 +245,9 @@ func AsMap() map[string]EnvVar {
ret := map[string]EnvVar{
"OLLAMA_DEBUG": {"OLLAMA_DEBUG", Debug(), "Show additional debug information (e.g. OLLAMA_DEBUG=1)"},
"OLLAMA_FLASH_ATTENTION": {"OLLAMA_FLASH_ATTENTION", FlashAttention(), "Enabled flash attention"},
"OLLAMA_GPU_OVERHEAD": {"OLLAMA_GPU_OVERHEAD", GpuOverhead(), "Reserve a portion of VRAM per GPU (bytes)"},
"OLLAMA_HOST": {"OLLAMA_HOST", Host(), "IP Address for the ollama server (default 127.0.0.1:11434)"},
"OLLAMA_KEEP_ALIVE": {"OLLAMA_KEEP_ALIVE", KeepAlive(), "The duration that models stay loaded in memory (default \"5m\")"},
"OLLAMA_LLM_LIBRARY": {"OLLAMA_LLM_LIBRARY", LLMLibrary(), "Set LLM library to bypass autodetection"},
"OLLAMA_LOAD_TIMEOUT": {"OLLAMA_LOAD_TIMEOUT", LoadTimeout(), "How long to allow model loads to stall before giving up (default \"5m\")"},
"OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners(), "Maximum number of loaded models per GPU"},
"OLLAMA_MAX_QUEUE": {"OLLAMA_MAX_QUEUE", MaxQueue(), "Maximum number of queued requests"},
"OLLAMA_MODELS": {"OLLAMA_MODELS", Models(), "The path to the models directory"},
@@ -243,22 +255,10 @@ func AsMap() map[string]EnvVar {
"OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune(), "Do not prune model blobs on startup"},
"OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel(), "Maximum number of parallel requests"},
"OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", Origins(), "A comma separated list of allowed origins"},
"OLLAMA_RUNNERS_DIR": {"OLLAMA_RUNNERS_DIR", RunnersDir(), "Location for runners"},
"OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
"OLLAMA_TMPDIR": {"OLLAMA_TMPDIR", TmpDir(), "Location for temporary files"},
// Informational
"HTTP_PROXY": {"HTTP_PROXY", String("HTTP_PROXY")(), "HTTP proxy"},
"HTTPS_PROXY": {"HTTPS_PROXY", String("HTTPS_PROXY")(), "HTTPS proxy"},
"NO_PROXY": {"NO_PROXY", String("NO_PROXY")(), "No proxy"},
}
if runtime.GOOS != "windows" {
// Windows environment variables are case-insensitive so there's no need to duplicate them
ret["http_proxy"] = EnvVar{"http_proxy", String("http_proxy")(), "HTTP proxy"}
ret["https_proxy"] = EnvVar{"https_proxy", String("https_proxy")(), "HTTPS proxy"}
ret["no_proxy"] = EnvVar{"no_proxy", String("no_proxy")(), "No proxy"}
}
if runtime.GOOS != "darwin" {
ret["CUDA_VISIBLE_DEVICES"] = EnvVar{"CUDA_VISIBLE_DEVICES", CudaVisibleDevices(), "Set which NVIDIA devices are visible"}
ret["HIP_VISIBLE_DEVICES"] = EnvVar{"HIP_VISIBLE_DEVICES", HipVisibleDevices(), "Set which AMD devices are visible"}
@@ -267,7 +267,6 @@ func AsMap() map[string]EnvVar {
ret["HSA_OVERRIDE_GFX_VERSION"] = EnvVar{"HSA_OVERRIDE_GFX_VERSION", HsaOverrideGfxVersion(), "Override the gfx used for all detected AMD GPUs"}
ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGPU(), "Enable experimental Intel GPU detection"}
}
return ret
}
@@ -283,12 +282,3 @@ func Values() map[string]string {
func Var(key string) string {
return strings.Trim(strings.TrimSpace(os.Getenv(key)), "\"'")
}
// On windows, we keep the binary at the top directory, but
// other platforms use a "bin" directory, so this returns ".."
func LibRelativeToExe() string {
if runtime.GOOS == "windows" {
return "."
}
return ".."
}

View File

@@ -13,35 +13,34 @@ func TestHost(t *testing.T) {
value string
expect string
}{
"empty": {"", "http://127.0.0.1:11434"},
"only address": {"1.2.3.4", "http://1.2.3.4:11434"},
"only port": {":1234", "http://:1234"},
"address and port": {"1.2.3.4:1234", "http://1.2.3.4:1234"},
"hostname": {"example.com", "http://example.com:11434"},
"hostname and port": {"example.com:1234", "http://example.com:1234"},
"zero port": {":0", "http://:0"},
"too large port": {":66000", "http://:11434"},
"too small port": {":-1", "http://:11434"},
"ipv6 localhost": {"[::1]", "http://[::1]:11434"},
"ipv6 world open": {"[::]", "http://[::]:11434"},
"ipv6 no brackets": {"::1", "http://[::1]:11434"},
"ipv6 + port": {"[::1]:1337", "http://[::1]:1337"},
"extra space": {" 1.2.3.4 ", "http://1.2.3.4:11434"},
"extra quotes": {"\"1.2.3.4\"", "http://1.2.3.4:11434"},
"extra space+quotes": {" \" 1.2.3.4 \" ", "http://1.2.3.4:11434"},
"extra single quotes": {"'1.2.3.4'", "http://1.2.3.4:11434"},
"http": {"http://1.2.3.4", "http://1.2.3.4:80"},
"http port": {"http://1.2.3.4:4321", "http://1.2.3.4:4321"},
"https": {"https://1.2.3.4", "https://1.2.3.4:443"},
"https port": {"https://1.2.3.4:4321", "https://1.2.3.4:4321"},
"proxy path": {"https://example.com/ollama", "https://example.com:443/ollama"},
"empty": {"", "127.0.0.1:11434"},
"only address": {"1.2.3.4", "1.2.3.4:11434"},
"only port": {":1234", ":1234"},
"address and port": {"1.2.3.4:1234", "1.2.3.4:1234"},
"hostname": {"example.com", "example.com:11434"},
"hostname and port": {"example.com:1234", "example.com:1234"},
"zero port": {":0", ":0"},
"too large port": {":66000", ":11434"},
"too small port": {":-1", ":11434"},
"ipv6 localhost": {"[::1]", "[::1]:11434"},
"ipv6 world open": {"[::]", "[::]:11434"},
"ipv6 no brackets": {"::1", "[::1]:11434"},
"ipv6 + port": {"[::1]:1337", "[::1]:1337"},
"extra space": {" 1.2.3.4 ", "1.2.3.4:11434"},
"extra quotes": {"\"1.2.3.4\"", "1.2.3.4:11434"},
"extra space+quotes": {" \" 1.2.3.4 \" ", "1.2.3.4:11434"},
"extra single quotes": {"'1.2.3.4'", "1.2.3.4:11434"},
"http": {"http://1.2.3.4", "1.2.3.4:80"},
"http port": {"http://1.2.3.4:4321", "1.2.3.4:4321"},
"https": {"https://1.2.3.4", "1.2.3.4:443"},
"https port": {"https://1.2.3.4:4321", "1.2.3.4:4321"},
}
for name, tt := range cases {
t.Run(name, func(t *testing.T) {
t.Setenv("OLLAMA_HOST", tt.value)
if host := Host(); host.String() != tt.expect {
t.Errorf("%s: expected %s, got %s", name, tt.expect, host.String())
if host := Host(); host.Host != tt.expect {
t.Errorf("%s: expected %s, got %s", name, tt.expect, host.Host)
}
})
}
@@ -215,40 +214,6 @@ func TestKeepAlive(t *testing.T) {
}
}
// TestLoadTimeout checks how LoadTimeout interprets OLLAMA_LOAD_TIMEOUT:
// duration strings, bare integers (seconds), non-positive values (infinite),
// and unparsable values (default).
func TestLoadTimeout(t *testing.T) {
	const fallback = 5 * time.Minute
	// Sentinel LoadTimeout reports for zero or negative settings.
	infinite := time.Duration(math.MaxInt64)

	expected := map[string]time.Duration{
		"":       fallback,
		"1s":     time.Second,
		"1m":     time.Minute,
		"1h":     time.Hour,
		"5m0s":   fallback,
		"1h2m3s": 1*time.Hour + 2*time.Minute + 3*time.Second,
		"0":      infinite,
		"60":     60 * time.Second,
		"120":    2 * time.Minute,
		"3600":   time.Hour,
		"-0":     infinite,
		"-1":     infinite,
		"-1m":    infinite,
		// invalid values
		" ":   fallback,
		"???": fallback,
		"1d":  fallback,
		"1y":  fallback,
		"1w":  fallback,
	}

	for input, want := range expected {
		t.Run(input, func(t *testing.T) {
			t.Setenv("OLLAMA_LOAD_TIMEOUT", input)
			got := LoadTimeout()
			if got != want {
				t.Errorf("%s: expected %s, got %s", input, want, got)
			}
		})
	}
}
func TestVar(t *testing.T) {
cases := map[string]string{
"value": "value",

View File

@@ -35,7 +35,7 @@ func main() {
ctx := context.Background()
req := &api.ChatRequest{
Model: "llama3.2",
Model: "llama3.1",
Messages: messages,
}

View File

@@ -4,10 +4,10 @@ This example provides an interface for asking questions to a PDF document.
## Setup
1. Ensure you have the `llama3.2` model installed:
1. Ensure you have the `llama3.1` model installed:
```
ollama pull llama3.2
ollama pull llama3.1
```
2. Install the Python Requirements.

View File

@@ -51,7 +51,7 @@ while True:
template=template,
)
llm = Ollama(model="llama3.2", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]))
llm = Ollama(model="llama3.1", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]))
qa_chain = RetrievalQA.from_chain_type(
llm,
retriever=vectorstore.as_retriever(),

View File

@@ -1,6 +1,6 @@
langchain==0.0.274
gpt4all==1.0.8
chromadb==0.5.0
chromadb==0.4.7
llama-cpp-python==0.1.81
urllib3==2.0.4
PyMuPDF==1.23.5
@@ -12,4 +12,4 @@ pandoc==2.3
pypandoc==1.11
tqdm==4.66.1
sentence_transformers==2.2.2
numpy>=1.22.2 # not directly required, pinned by Snyk to avoid a vulnerability
numpy>=1.22.2 # not directly required, pinned by Snyk to avoid a vulnerability

View File

@@ -4,10 +4,10 @@ This example summarizes the website, [https://ollama.com/blog/run-llama2-uncenso
## Running the Example
1. Ensure you have the `llama3.2` model installed:
1. Ensure you have the `llama3.1` model installed:
```bash
ollama pull llama3.2
ollama pull llama3.1
```
2. Install the Python Requirements.

View File

@@ -5,7 +5,7 @@ from langchain.chains.summarize import load_summarize_chain
loader = WebBaseLoader("https://ollama.com/blog/run-llama2-uncensored-locally")
docs = loader.load()
llm = Ollama(model="llama3.2")
llm = Ollama(model="llama3.1")
chain = load_summarize_chain(llm, chain_type="stuff")
result = chain.invoke(docs)

View File

@@ -4,10 +4,10 @@ This example is a basic "hello world" of using LangChain with Ollama.
## Running the Example
1. Ensure you have the `llama3.2` model installed:
1. Ensure you have the `llama3.1` model installed:
```bash
ollama pull llama3.2
ollama pull llama3.1
```
2. Install the Python Requirements.

View File

@@ -1,6 +1,6 @@
from langchain.llms import Ollama
input = input("What is your question?")
llm = Ollama(model="llama3.2")
llm = Ollama(model="llama3.1")
res = llm.predict(input)
print (res)

View File

@@ -1,4 +1,4 @@
FROM llama3.2
FROM llama3.1
PARAMETER temperature 1
SYSTEM """
You are Mario from super mario bros, acting as an assistant.

View File

@@ -2,12 +2,12 @@
# Example character: Mario
This example shows how to create a basic character using Llama 3.2 as the base model.
This example shows how to create a basic character using Llama3.1 as the base model.
To run this example:
1. Download the Modelfile
2. `ollama pull llama3.2` to get the base model used in the model file.
2. `ollama pull llama3.1` to get the base model used in the model file.
3. `ollama create NAME -f ./Modelfile`
4. `ollama run NAME`
@@ -18,7 +18,7 @@ Ask it some questions like "Who are you?" or "Is Peach in trouble again?"
What the model file looks like:
```
FROM llama3.2
FROM llama3.1
PARAMETER temperature 1
SYSTEM """
You are Mario from Super Mario Bros, acting as an assistant.

View File

@@ -1,93 +0,0 @@
# RAG Hallucination Checker using Bespoke-Minicheck
This example allows the user to ask questions related to a document, which can be specified via an article URL. Relevant chunks are retrieved from the document and given to `llama3.2` as context to answer the question. Then each sentence in the answer is checked against the retrieved chunks using `bespoke-minicheck` to ensure that the answer does not contain hallucinations.
## Running the Example
1. Ensure the `all-minilm` (embedding), `llama3.2` (chat), and `bespoke-minicheck` (check) models are installed:
```bash
ollama pull all-minilm
ollama pull llama3.2
ollama pull bespoke-minicheck
```
2. Install the dependencies.
```bash
pip install -r requirements.txt
```
3. Run the example:
```bash
python main.py
```
## Expected Output
```text
Enter the URL of an article you want to chat with, or press Enter for default example:
Loaded, chunked, and embedded text from https://www.theverge.com/2024/9/12/24242439/openai-o1-model-reasoning-strawberry-chatgpt.
Enter your question or type quit: Who is the CEO of openai?
Retrieved chunks:
OpenAI is releasing a new model called o1 , the first in a planned series of “ reasoning ” models that have been trained to answer more complex questions , faster than a human can . It s being released alongside o1-mini , a smaller , cheaper version . And yes , if you re steeped in AI rumors : this is , in fact , the extremely hyped Strawberry model . For OpenAI , o1 represents a step toward its broader goal of human-like artificial intelligence .
OpenAI is releasing a new model called o1 , the first in a planned series of “ reasoning ” models that have been trained to answer more complex questions , faster than a human can . It s being released alongside o1-mini , a smaller , cheaper version . And yes , if you re steeped in AI rumors : this is , in fact , the extremely hyped Strawberry model . For OpenAI , o1 represents a step toward its broader goal of human-like artificial intelligence . More practically , it does a better job at writing code and solving multistep problems than previous models . But it s also more expensive and slower to use than GPT-4o . OpenAI is calling this release of o1 a “ preview ” to emphasize how nascent it is . ChatGPT Plus and Team users get access to both o1-preview and o1-mini starting today , while Enterprise and Edu users will get access early next week .
More practically , it does a better job at writing code and solving multistep problems than previous models . But it s also more expensive and slower to use than GPT-4o . OpenAI is calling this release of o1 a “ preview ” to emphasize how nascent it is . ChatGPT Plus and Team users get access to both o1-preview and o1-mini starting today , while Enterprise and Edu users will get access early next week . OpenAI says it plans to bring o1-mini access to all the free users of ChatGPT but hasn t set a release date yet . Developer access to o1 is really expensive : In the API , o1-preview is $ 15 per 1 million input tokens , or chunks of text parsed by the model , and $ 60 per 1 million output tokens . For comparison , GPT-4o costs $ 5 per 1 million input tokens and $ 15 per 1 million output tokens .
OpenAI says it plans to bring o1-mini access to all the free users of ChatGPT but hasn t set a release date yet . Developer access to o1 is really expensive : In the API , o1-preview is $ 15 per 1 million input tokens , or chunks of text parsed by the model , and $ 60 per 1 million output tokens . For comparison , GPT-4o costs $ 5 per 1 million input tokens and $ 15 per 1 million output tokens . The training behind o1 is fundamentally different from its predecessors , OpenAI s research lead , Jerry Tworek , tells me , though the company is being vague about the exact details . He says o1 “ has been trained using a completely new optimization algorithm and a new training dataset specifically tailored for it. ” Image : OpenAI OpenAI taught previous GPT models to mimic patterns from its training data .
LLM Answer:
The text does not mention the CEO of OpenAI. It only discusses the release of a new model called o1 and some details about it, but does not provide information on the company's leadership.
LLM Claim: The text does not mention the CEO of OpenAI.
Is this claim supported by the context according to bespoke-minicheck? Yes
LLM Claim: It only discusses the release of a new model called o1 and some details about it, but does not provide information on the company's leadership.
Is this claim supported by the context according to bespoke-minicheck? No
```
The second claim is unsupported since the text mentions the research lead.
Another tricky example:
```text
Enter your question or type quit: what sets o1 apart from gpt-4o?
Retrieved chunks:
OpenAI says it plans to bring o1-mini access to all the free users of ChatGPT but hasn t set a release date yet . Developer access to o1 is really expensive : In the API , o1-preview is $ 15 per 1 million input tokens , or chunks of text parsed by the model , and $ 60 per 1 million output tokens . For comparison , GPT-4o costs $ 5 per 1 million input tokens and $ 15 per 1 million output tokens . The training behind o1 is fundamentally different from its predecessors , OpenAI s research lead , Jerry Tworek , tells me , though the company is being vague about the exact details . He says o1 “ has been trained using a completely new optimization algorithm and a new training dataset specifically tailored for it. ” Image : OpenAI OpenAI taught previous GPT models to mimic patterns from its training data .
He says OpenAI also tested o1 against a qualifying exam for the International Mathematics Olympiad , and while GPT-4o only correctly solved only 13 percent of problems , o1 scored 83 percent . “ We can t say we solved hallucinations ” In online programming contests known as Codeforces competitions , this new model reached the 89th percentile of participants , and OpenAI claims the next update of this model will perform “ similarly to PhD students on challenging benchmark tasks in physics , chemistry and biology. ” At the same time , o1 is not as capable as GPT-4o in a lot of areas . It doesn t do as well on factual knowledge about the world .
More practically , it does a better job at writing code and solving multistep problems than previous models . But it s also more expensive and slower to use than GPT-4o . OpenAI is calling this release of o1 a “ preview ” to emphasize how nascent it is . ChatGPT Plus and Team users get access to both o1-preview and o1-mini starting today , while Enterprise and Edu users will get access early next week . OpenAI says it plans to bring o1-mini access to all the free users of ChatGPT but hasn t set a release date yet . Developer access to o1 is really expensive : In the API , o1-preview is $ 15 per 1 million input tokens , or chunks of text parsed by the model , and $ 60 per 1 million output tokens . For comparison , GPT-4o costs $ 5 per 1 million input tokens and $ 15 per 1 million output tokens .
“ We can t say we solved hallucinations ” In online programming contests known as Codeforces competitions , this new model reached the 89th percentile of participants , and OpenAI claims the next update of this model will perform “ similarly to PhD students on challenging benchmark tasks in physics , chemistry and biology. ” At the same time , o1 is not as capable as GPT-4o in a lot of areas . It doesn t do as well on factual knowledge about the world . It also doesn t have the ability to browse the web or process files and images . Still , the company believes it represents a brand-new class of capabilities . It was named o1 to indicate “ resetting the counter back to 1. ” “ I m gon na be honest : I think we re terrible at naming , traditionally , ” McGrew says .
LLM Answer: According to the text, several things set o1 apart from GPT-4o:
* In online programming contests (Codeforces competitions), o1 scored 83% correct solutions compared to GPT-4o's 13%.
* The training behind o1 is "fundamentally different" from its predecessors, including a completely new optimization algorithm and a new training dataset specifically tailored for it.
* o1 has been shown to perform similarly to PhD students on challenging benchmark tasks in physics, chemistry, and biology, while GPT-4o does not have this level of performance.
* However, o1 is also less capable than GPT-4o in some areas, particularly with regard to factual knowledge about the world.
LLM Claim: According to the text, several things set o1 apart from GPT-4o:
* In online programming contests (Codeforces competitions), o1 scored 83% correct solutions compared to GPT-4o's 13%.
Is this claim supported by the context according to bespoke-minicheck? Yes
LLM Claim: * The training behind o1 is "fundamentally different" from its predecessors, including a completely new optimization algorithm and a new training dataset specifically tailored for it.
Is this claim supported by the context according to bespoke-minicheck? Yes
LLM Claim: * o1 has been shown to perform similarly to PhD students on challenging benchmark tasks in physics, chemistry, and biology, while GPT-4o does not have this level of performance.
Is this claim supported by the context according to bespoke-minicheck? No
LLM Claim: * However, o1 is also less capable than GPT-4o in some areas, particularly with regard to factual knowledge about the world.
Is this claim supported by the context according to bespoke-minicheck? Yes
```
We see that the third claim "* o1 has been shown to perform similarly to PhD students on challenging benchmark tasks in physics, chemistry, and biology, while GPT-4o does not have this level of performance." is not supported by the context. This is because the context only mentions that o1 "is claimed to perform" which is different from "has been shown to perform".

View File

@@ -1,137 +0,0 @@
import ollama
import warnings
from mattsollamatools import chunker
from newspaper import Article
import numpy as np
from sklearn.neighbors import NearestNeighbors
import nltk
warnings.filterwarnings(
"ignore", category=FutureWarning, module="transformers.tokenization_utils_base"
)
nltk.download("punkt_tab", quiet=True)
def getArticleText(url):
    """Return the body text of the news article at ``url``.

    News pages usually carry ads, menus and other clutter around the story.
    The page is downloaded and parsed through ``Article`` so that only the
    article text itself is returned.
    """
    page = Article(url)
    # download() must run before parse(); parse() populates ``text``.
    page.download()
    page.parse()
    return page.text
def knn_search(question_embedding, embeddings, k=5):
    """Find the stored chunks closest to a question embedding (cosine KNN).

    Args:
        question_embedding: 2-D array-like holding the query embedding(s);
            only the neighbours of the first row are returned.
        embeddings: list of article dicts, each with an ``"embeddings"`` list
            whose items carry ``"embedding"`` (vector) and ``"source"`` (text).
        k: maximum number of neighbours to return.

    Returns:
        A list of ``(index, source_text)`` tuples, nearest first. The list has
        at most ``k`` entries: fewer when fewer chunks are stored, and it is
        empty when no chunks are stored at all.
    """
    items = [item for article in embeddings for item in article["embeddings"]]
    if not items:
        # Nothing to search; fitting a KNN model on an empty set would raise.
        return []

    X = np.array([item["embedding"] for item in items])
    source_texts = [item["source"] for item in items]

    # scikit-learn rejects n_neighbors > n_samples, which happens for short
    # articles that produce fewer chunks than requested.
    k = min(k, len(items))

    # Fit a KNN model on the embeddings
    knn = NearestNeighbors(n_neighbors=k, metric="cosine")
    knn.fit(X)

    # Find the indices and distances of the k-nearest neighbors.
    _, indices = knn.kneighbors(question_embedding, n_neighbors=k)

    # Pair each neighbour index with its source text.
    return [(idx, source_texts[idx]) for idx in indices[0]]
def check(document, claim):
    """Ask bespoke-minicheck whether ``claim`` is supported by ``document``.

    The model answers Yes/yes when the claim is supported and No/no otherwise;
    that answer is returned with surrounding whitespace stripped.
    Support for logits will be added in the future.

    bespoke-minicheck's system prompt is defined as:
      'Determine whether the provided claim is consistent with the corresponding
      document. Consistency in this context implies that all information presented in the claim
      is substantiated by the document. If not, it should be considered inconsistent. Please
      assess the claim's consistency with the document by responding with either "Yes" or "No".'

    bespoke-minicheck's user prompt is defined as:
      "Document: {document}\nClaim: {claim}"
    """
    # Two predicted tokens at temperature 0 are requested for the verdict.
    generation_options = {"num_predict": 2, "temperature": 0.0}
    result = ollama.generate(
        model="bespoke-minicheck",
        prompt=f"Document: {document}\nClaim: {claim}",
        options=generation_options,
    )
    verdict = result["response"]
    return verdict.strip()
# Interactive driver: load one article, embed its chunks, then answer questions
# about it and fact-check every sentence of each answer.
if __name__ == "__main__":
    allEmbeddings = []
    default_url = "https://www.theverge.com/2024/9/12/24242439/openai-o1-model-reasoning-strawberry-chatgpt"
    user_input = input(
        "Enter the URL of an article you want to chat with, or press Enter for default example: "
    )
    # Fall back to the default article when the user just presses Enter.
    article_url = user_input.strip() if user_input.strip() else default_url
    article = {}
    article["embeddings"] = []
    article["url"] = article_url
    text = getArticleText(article_url)
    chunks = chunker(text)

    # Embed (batch) chunks using ollama
    embeddings = ollama.embed(model="all-minilm", input=chunks)["embeddings"]

    for chunk, embedding in zip(chunks, embeddings):
        item = {}
        item["source"] = chunk
        item["embedding"] = embedding
        item["sourcelength"] = len(chunk)

        article["embeddings"].append(item)

    allEmbeddings.append(article)

    print(f"\nLoaded, chunked, and embedded text from {article_url}.\n")

    while True:
        # Input a question from the user
        # For example, "Who is the chief research officer?"
        question = input("Enter your question or type quit: ")

        if question.lower() == "quit":
            break

        # Embed the user's question using ollama.embed
        question_embedding = ollama.embed(model="all-minilm", input=question)[
            "embeddings"
        ]

        # Perform KNN search to find the best matches (indices and source text)
        best_matches = knn_search(question_embedding, allEmbeddings, k=4)

        sourcetext = "\n\n".join([source_text for (_, source_text) in best_matches])

        print(f"\nRetrieved chunks: \n{sourcetext}\n")

        # Give the retrieved chunks and question to the chat model
        system_prompt = f"Only use the following information to answer the question. Do not use anything else: {sourcetext}"

        # `stream` is a request-level argument, not a model option, so it is
        # passed as a keyword rather than inside `options`.
        ollama_response = ollama.generate(
            model="llama3.2",
            prompt=question,
            system=system_prompt,
            stream=False,
        )

        answer = ollama_response["response"]
        print(f"LLM Answer:\n{answer}\n")

        # Check each sentence in the response for grounded factuality
        if answer:
            for claim in nltk.sent_tokenize(answer):
                print(f"LLM Claim: {claim}")
                print(
                    f"Is this claim supported by the context according to bespoke-minicheck? {check(sourcetext, claim)}\n"
                )

View File

@@ -1,8 +0,0 @@
ollama
lxml==5.3.0
lxml_html_clean==0.2.2
mattsollamatools==0.0.25
newspaper3k==0.2.8
nltk==3.9.1
numpy==1.26.4
scikit-learn==1.5.2

View File

@@ -1,53 +0,0 @@
"""Simple example to demonstrate how to use the bespoke-minicheck model."""
import ollama
# NOTE: ollama must be running for this to work, start the ollama app or run `ollama serve`
def check(document, claim):
    """Ask bespoke-minicheck whether a claim is grounded in a document.

    The request goes through ollama's generate endpoint. The user prompt puts
    the document on a line starting with "Document: " and the claim on the
    following line starting with "Claim: ", which is the user-prompt layout
    defined for bespoke-minicheck.

    For reference, bespoke-minicheck's system prompt is defined as:
        'Determine whether the provided claim is consistent with the
        corresponding document. Consistency in this context implies that all
        information presented in the claim is substantiated by the document.
        If not, it should be considered inconsistent. Please assess the
        claim's consistency with the document by responding with either
        "Yes" or "No".'

    Args:
        document: Text that serves as the source of truth.
        claim: Statement to verify against the document.

    Returns:
        The model's reply with surrounding whitespace stripped: Yes/yes when
        the claim is supported by the document, No/no otherwise. Support for
        logits will be added in the future.
    """
    # Two predicted tokens at temperature 0 are requested for the short verdict.
    generation_options = {"num_predict": 2, "temperature": 0.0}
    reply = ollama.generate(
        model="bespoke-minicheck",
        prompt=f"Document: {document}\nClaim: {claim}",
        options=generation_options,
    )
    return reply["response"].strip()
def get_user_input(prompt):
    """Prompt the user on stdin and return the text they entered.

    An empty response is treated as a request to quit: the process exits via
    ``SystemExit`` instead of returning. After a non-empty response a blank
    line is printed to separate it from whatever is printed next.

    Args:
        prompt: Text displayed to the user before reading input.

    Returns:
        The non-empty string the user typed.

    Raises:
        SystemExit: If the user submits an empty line.
    """
    # Imported locally so this function does not rely on the file's import block.
    import sys

    user_input = input(prompt)
    if not user_input:
        # Use sys.exit() rather than the bare exit() helper: exit() is injected
        # by the `site` module for interactive sessions and is not guaranteed
        # to exist (for example under `python -S` or in frozen builds).
        sys.exit()
    print()
    return user_input
def main():
    """Run an interactive loop that fact-checks user-supplied claims.

    Each pass reads a document, then a claim, asks `check` for a verdict and
    prints it. The loop has no exit condition of its own; it ends when
    `get_user_input` terminates the program on an empty entry.
    """
    while True:
        # Source text, e.g. "Ryan likes running and biking."
        source_document = get_user_input("Enter a document: ")
        # Statement to verify against it, e.g. "Ryan likes to run."
        statement = get_user_input("Enter a claim: ")

        # Ask the model whether the statement is supported by the source text.
        verdict = check(source_document, statement)
        print(
            f"Is the claim supported by the document according to bespoke-minicheck? {verdict}"
        )
        print("\n\n")


if __name__ == "__main__":
    main()

View File

@@ -1,54 +0,0 @@
# Simple Bespoke-Minicheck Example
`bespoke-minicheck` is a model for checking if a claim is supported by a document. It is used through the **generate** endpoint, which is called in this example with a `prompt` that includes the expected formatting of the user input.
## Running the Example
1. Ensure you have the `bespoke-minicheck` model installed:
```bash
ollama pull bespoke-minicheck
```
2. Install the dependencies:
```bash
pip install -r requirements.txt
```
3. Run the program:
```bash
python main.py
```
4. Enter a document and a claim when prompted:
```bash
Enter a document: Roses are red.
Enter a claim: Roses are blue.
```
The claim and document are then given to the `bespoke-minicheck` model as inputs, and the model generates a response (Yes or No) indicating whether the claim is supported by the document.
```bash
Is the claim supported by the document according to bespoke-minicheck? No
```
## More Examples
Document ([source](https://en.wikipedia.org/wiki/Apple_I)):
> The Apple Computer 1 (Apple-1[a]), later known predominantly as the Apple I (written with a Roman numeral),[b] is an 8-bit motherboard-only personal computer designed by Steve Wozniak[5][6] and released by the Apple Computer Company (now Apple Inc.) in 1976. The company was initially formed to sell the Apple I – its first product – and would later become the world's largest technology company.[7] The idea of starting a company and selling the computer came from Wozniak's friend and Apple co-founder Steve Jobs.[8][9] One of the main innovations of the Apple I was that it included video display terminal circuitry on its circuit board, allowing it to connect to a low-cost composite video monitor or television, instead of an expensive computer terminal, compared to most existing computers at the time.
Claim:
>The Apple I is a 16-bit computer.
Expected output:
>Is the claim supported by the document according to bespoke-minicheck? **No**
Claim:
>Apple was originally called the Apple Computer Company.
Expected output:
>Is the claim supported by the document according to bespoke-minicheck? **Yes**

View File

@@ -1 +0,0 @@
ollama

View File

@@ -2,7 +2,7 @@ import requests
import json
import random
model = "llama3.2"
model = "llama3.1"
template = {
"firstName": "",
"lastName": "",

View File

@@ -12,7 +12,7 @@ countries = [
"France",
]
country = random.choice(countries)
model = "llama3.2"
model = "llama3.1"
prompt = f"generate one realistically believable sample data set of a persons first name, last name, address in {country}, and phone number. Do not use common names. Respond using JSON. Key names should have no backslashes, values should use plain ascii with no special characters."

View File

@@ -6,10 +6,10 @@ There are two python scripts in this example. `randomaddresses.py` generates ran
## Running the Example
1. Ensure you have the `llama3.2` model installed:
1. Ensure you have the `llama3.1` model installed:
```bash
ollama pull llama3.2
ollama pull llama3.1
```
2. Install the Python Requirements.

View File

@@ -4,5 +4,5 @@ SYSTEM """
You are a log file analyzer. You will receive a set of lines from a log file for some software application, find the errors and other interesting aspects of the logs, and explain them so a new user can understand what they mean. If there are any steps they can do to resolve them, list the steps in your answer.
"""
PARAMETER temperature 0.3
PARAMETER TEMPERATURE 0.3

View File

@@ -21,8 +21,6 @@ You can try this with the `logtest.logfile` file included in this directory.
2. Install the Python Requirements.
```bash
python3 -m venv .venv
source .venv/bin/activate
pip install -r requirements.txt
```

View File

@@ -1 +1 @@
Requests>=2.32.3
Requests==2.31.0

View File

@@ -2,7 +2,7 @@ import json
import requests
# NOTE: ollama must be running for this to work, start the ollama app or run `ollama serve`
model = "llama3.2" # TODO: update this for whatever model you wish to use
model = "llama3.1" # TODO: update this for whatever model you wish to use
def chat(messages):

View File

@@ -4,10 +4,10 @@ The **chat** endpoint is one of two ways to generate text from an LLM with Ollam
## Running the Example
1. Ensure you have the `llama3.2` model installed:
1. Ensure you have the `llama3.1` model installed:
```bash
ollama pull llama3.2
ollama pull llama3.1
```
2. Install the Python Requirements.

View File

@@ -1,6 +1,6 @@
import * as readline from "readline";
const model = "llama3.2";
const model = "llama3.1";
type Message = {
role: "assistant" | "user" | "system";
content: string;

View File

@@ -9,8 +9,6 @@ import (
"path/filepath"
"runtime"
"strings"
"github.com/ollama/ollama/envconfig"
)
// Determine if the given ROCm lib directory is usable by checking for existence of some glob patterns
@@ -56,7 +54,7 @@ func commonAMDValidateLibDir() (string, error) {
// Installer payload location if we're running the installed binary
exe, err := os.Executable()
if err == nil {
rocmTargetDir := filepath.Join(filepath.Dir(exe), envconfig.LibRelativeToExe(), "lib", "ollama")
rocmTargetDir := filepath.Join(filepath.Dir(exe), "..", "lib", "ollama")
if rocmLibUsable(rocmTargetDir) {
slog.Debug("detected ROCM next to ollama executable " + rocmTargetDir)
return rocmTargetDir, nil

View File

@@ -5,7 +5,6 @@ import (
"errors"
"fmt"
"io"
"io/fs"
"log/slog"
"os"
"path/filepath"
@@ -360,10 +359,6 @@ func AMDGetGPUInfo() []RocmGPUInfo {
if len(resp) == 0 {
slog.Info("no compatible amdgpu devices detected")
}
if err := verifyKFDDriverAccess(); err != nil {
slog.Error("amdgpu devices detected but permission problems block access", "error", err)
return nil
}
return resp
}
@@ -460,19 +455,3 @@ func getFreeMemory(usedFile string) (uint64, error) {
}
return usedMemory, nil
}
func verifyKFDDriverAccess() error {
// Verify we have permissions - either running as root, or we have group access to the driver
fd, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0o666)
if err != nil {
if errors.Is(err, fs.ErrPermission) {
return fmt.Errorf("permissions not set up properly. Either run ollama as root, or add you user account to the render group. %w", err)
} else if errors.Is(err, fs.ErrNotExist) {
// Container runtime failure?
return fmt.Errorf("kfd driver not loaded. If running in a container, remember to include '--device /dev/kfd --device /dev/dri'")
}
return fmt.Errorf("failed to check permission on /dev/kfd: %w", err)
}
fd.Close()
return nil
}

View File

@@ -153,7 +153,7 @@ func AMDValidateLibDir() (string, error) {
// Installer payload (if we're running from some other location)
localAppData := os.Getenv("LOCALAPPDATA")
appDir := filepath.Join(localAppData, "Programs", "Ollama")
rocmTargetDir := filepath.Join(appDir, envconfig.LibRelativeToExe(), "lib", "ollama")
rocmTargetDir := filepath.Join(appDir, "..", "lib", "ollama")
if rocmLibUsable(rocmTargetDir) {
slog.Debug("detected ollama installed ROCm at " + rocmTargetDir)
return rocmTargetDir, nil

148
gpu/assets.go Normal file
View File

@@ -0,0 +1,148 @@
package gpu
import (
"errors"
"fmt"
"log/slog"
"os"
"path/filepath"
"runtime"
"strconv"
"strings"
"sync"
"syscall"
"time"
"github.com/ollama/ollama/envconfig"
)
var (
// lock guards payloadsDir; PayloadsDir and Cleanup both hold it for their
// entire duration.
lock sync.Mutex
// payloadsDir caches the directory resolved by PayloadsDir. It starts empty
// and is only assigned once PayloadsDir has picked a directory.
payloadsDir = ""
)
// PayloadsDir returns the directory used for runner payloads.
//
// The result is resolved once and cached in payloadsDir. When
// envconfig.RunnersDir() is non-empty that value is used as-is. Otherwise
// stale temp dirs are cleaned up, a temp dir is created (or the one named by
// envconfig.TmpDir() is created if missing), a pid file is written into it,
// and the path of its "runners" subdirectory is returned. The "runners"
// directory itself is not created here.
//
// An error is returned when the temp dir cannot be created or the pid file
// cannot be written; in that case nothing is cached.
func PayloadsDir() (string, error) {
	lock.Lock()
	defer lock.Unlock()

	// Fast path: an earlier call already resolved the directory.
	if payloadsDir != "" {
		return payloadsDir, nil
	}

	if configured := envconfig.RunnersDir(); configured != "" {
		payloadsDir = configured
		return payloadsDir, nil
	}

	// The remainder only applies on non-windows where we still carry payloads in the main executable
	cleanupTmpDirs()

	base := envconfig.TmpDir()
	if base != "" {
		// An explicit tmp dir was configured: make sure it exists.
		if err := os.MkdirAll(base, 0o755); err != nil {
			return "", fmt.Errorf("failed to generate tmp dir %s: %w", base, err)
		}
	} else {
		// No override: create a fresh uniquely named dir under the system temp dir.
		created, err := os.MkdirTemp("", "ollama")
		if err != nil {
			return "", fmt.Errorf("failed to generate tmp dir: %w", err)
		}
		base = created
	}

	// Record our pid so a later run can recognise this dir as orphaned.
	pidFile := filepath.Join(base, "ollama.pid")
	if err := os.WriteFile(pidFile, []byte(strconv.Itoa(os.Getpid())), 0o644); err != nil {
		return "", fmt.Errorf("failed to write pid file %s: %w", pidFile, err)
	}

	// Payloads live in a distinct subdirectory of the tmp dir,
	// typically something like /tmp/ollama3208993108/runners on linux.
	payloadsDir = filepath.Join(base, "runners")
	return payloadsDir, nil
}
// Best effort to clean up prior tmpdirs
func cleanupTmpDirs() {
matches, err := filepath.Glob(filepath.Join(os.TempDir(), "ollama*", "ollama.pid"))
if err != nil {
return
}
for _, match := range matches {
raw, err := os.ReadFile(match)
if errors.Is(err, os.ErrNotExist) {
slog.Debug("not a ollama runtime directory, skipping", "path", match)
continue
} else if err != nil {
slog.Warn("could not read ollama.pid, skipping", "path", match, "error", err)
continue
}
pid, err := strconv.Atoi(string(raw))
if err != nil {
slog.Warn("invalid pid, skipping", "path", match, "error", err)
continue
}
p, err := os.FindProcess(pid)
if err == nil && !errors.Is(p.Signal(syscall.Signal(0)), os.ErrProcessDone) {
slog.Warn("process still running, skipping", "pid", pid, "path", match)
continue
}
if err := os.Remove(match); err != nil {
slog.Warn("could not cleanup stale pidfile", "path", match, "error", err)
}
runners := filepath.Join(filepath.Dir(match), "runners")
if err := os.RemoveAll(runners); err != nil {
slog.Warn("could not cleanup stale runners", "path", runners, "error", err)
}
if err := os.Remove(filepath.Dir(match)); err != nil {
slog.Warn("could not cleanup stale tmpdir", "path", filepath.Dir(match), "error", err)
}
}
}
// Cleanup removes the temporary directory that contains the payloads dir.
//
// It only acts when payloadsDir has been set, envconfig.RunnersDir() is
// empty, and the OS is not windows. A failed removal is retried once after a
// one second pause; if the retry also fails a warning is logged.
func Cleanup() {
	lock.Lock()
	defer lock.Unlock()

	configured := envconfig.RunnersDir()
	if payloadsDir == "" || configured != "" || runtime.GOOS == "windows" {
		return
	}

	// We want to fully clean up the tmpdir parent of the payloads dir
	parent := filepath.Clean(filepath.Join(payloadsDir, ".."))
	slog.Debug("cleaning up", "dir", parent)
	if err := os.RemoveAll(parent); err == nil {
		return
	}

	// Original rationale: on windows, removing too quickly can fail because
	// llama.dll may still be in use, so pause and try once more.
	// NOTE(review): the guard above already returns on windows, so this retry
	// only ever runs on other platforms - confirm it is still wanted.
	time.Sleep(time.Second)
	if err := os.RemoveAll(parent); err != nil {
		slog.Warn("failed to clean up", "dir", parent, "err", err)
	}
}
func UpdatePath(dir string) {
if runtime.GOOS == "windows" {
tmpDir := filepath.Dir(dir)
pathComponents := strings.Split(os.Getenv("PATH"), ";")
i := 0
for _, comp := range pathComponents {
if strings.EqualFold(comp, dir) {
return
}
// Remove any other prior paths to our temp dir
if !strings.HasPrefix(strings.ToLower(comp), strings.ToLower(tmpDir)) {
pathComponents[i] = comp
i++
}
}
newPath := strings.Join(append([]string{dir}, pathComponents...), ";")
slog.Info("updating", "PATH", newPath)
os.Setenv("PATH", newPath)
}
// linux and darwin rely on rpath
}

View File

@@ -57,7 +57,7 @@ func cudaVariant(gpuInfo CudaGPUInfo) string {
}
}
if gpuInfo.computeMajor < 6 || gpuInfo.DriverMajor < 12 || (gpuInfo.DriverMajor == 12 && gpuInfo.DriverMinor == 0) {
if gpuInfo.computeMajor < 6 || gpuInfo.DriverMajor < 12 {
return "v11"
}
return "v12"

View File

@@ -93,9 +93,10 @@ func initCudaHandles() *cudaHandles {
localAppData := os.Getenv("LOCALAPPDATA")
cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", CudartMgmtName)}
}
libDir := LibraryDir()
if libDir != "" {
cudartMgmtPatterns = []string{filepath.Join(libDir, CudartMgmtName)}
tmpDir, _ := PayloadsDir()
if tmpDir != "" {
// TODO - add "payloads" for subprocess
cudartMgmtPatterns = []string{filepath.Join(tmpDir, "cuda*", CudartMgmtName)}
}
cudartMgmtPatterns = append(cudartMgmtPatterns, CudartGlobs...)
@@ -205,16 +206,13 @@ func GetGPUInfo() GpuInfoList {
if err != nil {
slog.Warn("error looking up system memory", "error", err)
}
depPath := LibraryDir()
cpus = []CPUInfo{
{
GpuInfo: GpuInfo{
memInfo: mem,
Library: "cpu",
Variant: cpuCapability.String(),
ID: "0",
DependencyPath: depPath,
memInfo: mem,
Library: "cpu",
Variant: cpuCapability.String(),
ID: "0",
},
},
}
@@ -227,6 +225,8 @@ func GetGPUInfo() GpuInfoList {
return GpuInfoList{cpus[0].GpuInfo}
}
depPath := LibraryDir()
// Load ALL libraries
cHandles = initCudaHandles()
@@ -653,7 +653,7 @@ func LibraryDir() string {
slog.Warn("failed to lookup working directory", "error", err)
}
// Scan for any of our dependencies, and pick first match
for _, root := range []string{filepath.Dir(appExe), filepath.Join(filepath.Dir(appExe), envconfig.LibRelativeToExe()), cwd} {
for _, root := range []string{filepath.Dir(appExe), filepath.Join(filepath.Dir(appExe), ".."), cwd} {
libDep := filepath.Join("lib", "ollama")
if _, err := os.Stat(filepath.Join(root, libDep)); err == nil {
return filepath.Join(root, libDep)

View File

@@ -2,7 +2,7 @@ set(TARGET ollama_llama_server)
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
set(LLAMA_SERVER_LDFLAGS $ENV{LLAMA_SERVER_LDFLAGS})
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
add_executable(${TARGET} server.cpp utils.hpp httplib.h)
add_executable(${TARGET} server.cpp utils.hpp json.hpp httplib.h)
install(TARGETS ${TARGET} RUNTIME)
target_compile_definitions(${TARGET} PRIVATE
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
@@ -10,6 +10,5 @@ target_compile_definitions(${TARGET} PRIVATE
target_link_libraries(${TARGET} PRIVATE ggml llama common llava ${CMAKE_THREAD_LIBS_INIT} ${LLAMA_SERVER_LDFLAGS})
if (WIN32)
TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
target_link_options(${TARGET} PRIVATE -municode -Wl,/subsystem:console)
endif()
target_compile_features(${TARGET} PRIVATE cxx_std_11)

24596
llm/ext_server/json.hpp vendored Normal file
View File

File diff suppressed because it is too large Load Diff

View File

@@ -262,7 +262,7 @@ struct server_slot {
char buffer[512];
double t_token = t_prompt_processing / n_prompt_tokens_processed;
double n_tokens_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed;
snprintf(buffer, sizeof(buffer), "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)",
sprintf(buffer, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)",
t_prompt_processing, n_prompt_tokens_processed,
t_token, n_tokens_second);
LOG_DEBUG(buffer, {
@@ -276,7 +276,7 @@ struct server_slot {
t_token = t_token_generation / n_decoded;
n_tokens_second = 1e3 / t_token_generation * n_decoded;
snprintf(buffer, sizeof(buffer), "generation eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)",
sprintf(buffer, "generation eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)",
t_token_generation, n_decoded,
t_token, n_tokens_second);
LOG_DEBUG(buffer, {
@@ -288,7 +288,7 @@ struct server_slot {
{"n_tokens_second", n_tokens_second},
});
snprintf(buffer, sizeof(buffer), " total time = %10.2f ms", t_prompt_processing + t_token_generation);
sprintf(buffer, " total time = %10.2f ms", t_prompt_processing + t_token_generation);
LOG_DEBUG(buffer, {
{"slot_id", id},
{"task_id", task_id},
@@ -425,7 +425,7 @@ struct llama_server_context
n_ctx = llama_n_ctx(ctx);
add_bos_token = llama_add_bos_token(model);
add_bos_token = llama_should_add_bos_token(model);
return true;
}
@@ -913,9 +913,7 @@ struct llama_server_context
slot.sampled = result.tok;
// search stop word and delete it
if (!llama_token_is_eog(model, result.tok))
slot.generated_text += token_str;
slot.generated_text += token_str;
slot.has_next_token = true;
if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1)
@@ -956,36 +954,30 @@ struct llama_server_context
if (!incomplete)
{
size_t pos = std::min(slot.n_sent_text, slot.generated_text.size());
const std::string str_test = slot.generated_text.substr(pos);
bool is_stop_full = false;
size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot);
if (stop_pos != std::string::npos)
{
is_stop_full = true;
slot.generated_text.erase(
slot.generated_text.begin() + pos + stop_pos,
slot.generated_text.end());
pos = std::min(slot.n_sent_text, slot.generated_text.size());
}
else
{
is_stop_full = false;
stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_PARTIAL, slot);
}
if (!llama_token_is_eog(model, result.tok)) {
const std::string str_test = slot.generated_text.substr(pos);
bool is_stop_full = false;
size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot);
if (stop_pos != std::string::npos)
{
is_stop_full = true;
slot.generated_text.erase(
slot.generated_text.begin() + pos + stop_pos,
slot.generated_text.end());
pos = std::min(slot.n_sent_text, slot.generated_text.size());
}
else
{
is_stop_full = false;
stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_PARTIAL, slot);
}
// check if there is any token to predict
if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0))
{
// no send the stop word in the response
result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
slot.n_sent_text += result.text_to_send.size();
// add the token to slot queue and cache
}
} else {
result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
slot.n_sent_text += result.text_to_send.size();
// check if there is any token to predict
if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0))
{
// no send the stop word in the response
result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
slot.n_sent_text += result.text_to_send.size();
// add the token to slot queue and cache
}
if (slot.params.stream)
@@ -1039,7 +1031,7 @@ struct llama_server_context
continue;
}
if (!llava_image_embed_make_with_clip_img(clp_ctx, params.cpuparams.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
if (!llava_image_embed_make_with_clip_img(clp_ctx, params.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
LOG_TEE("Error processing the given image");
return false;
}
@@ -1125,7 +1117,9 @@ struct llama_server_context
{"multimodal", multimodal}
};
res.result_json["content"] = tkn.text_to_send;
if (!llama_token_is_eog(model, tkn.tok)) {
res.result_json["content"] = tkn.text_to_send;
}
if (slot.sparams.n_probs > 0)
{
@@ -1246,17 +1240,6 @@ struct llama_server_context
task.type = TASK_TYPE_COMPLETION;
task.multitask_id = multitask_id;
// Set cross attention state for mllama models
// TODO (jmorganca): this should be provided via the API
// TODO (jmorganca): generalize this beyond mllama models
char arch_str[256];
llama_model_meta_val_str(model, "general.architecture", arch_str, 256);
if (strcmp(arch_str, "mllama") == 0) {
// TODO (jmorganca): this should be passed in via the llama_decode api
// or similar, maybe using the llama_batch struct
// llama_set_cross_attn_state(ctx, (float*)cross_attn_state);
}
// when a completion task's prompt array is not a singleton, we split it into multiple requests
// otherwise, it's a single-prompt task, we actually queue it
// if there's numbers in the prompt array it will be treated as an array of tokens
@@ -2031,7 +2014,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
printf("options:\n");
printf(" -h, --help show this help message and exit\n");
printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.cpuparams.n_threads);
printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
printf(" -tb N, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)\n");
printf(" --threads-http N number of threads in the http server pool to process requests (default: max(hardware concurrency - 1, --parallel N + 2))\n");
printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
@@ -2304,7 +2287,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
invalid_param = true;
break;
}
params.cpuparams.n_threads = std::stoi(argv[i]);
params.n_threads = std::stoi(argv[i]);
}
else if (arg == "--grp-attn-n" || arg == "-gan")
{
@@ -2332,7 +2315,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
invalid_param = true;
break;
}
params.cpuparams_batch.n_threads = std::stoi(argv[i]);
params.n_threads_batch = std::stoi(argv[i]);
}
else if (arg == "--threads-http")
{
@@ -2643,11 +2626,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
params.kv_overrides.back().key[0] = 0;
}
postprocess_cpu_params(params.cpuparams, nullptr);
postprocess_cpu_params(params.cpuparams_batch, &params.cpuparams);
postprocess_cpu_params(params.draft_cpuparams, &params.cpuparams);
postprocess_cpu_params(params.draft_cpuparams_batch, &params.cpuparams_batch);
if (invalid_param)
{
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
@@ -2797,8 +2775,8 @@ int main(int argc, char **argv) {
{"commit", LLAMA_COMMIT}});
LOG_INFO("system info", {
{"n_threads", params.cpuparams.n_threads},
{"n_threads_batch", params.cpuparams_batch.n_threads},
{"n_threads", params.n_threads},
{"n_threads_batch", params.n_threads_batch},
{"total_threads", std::thread::hardware_concurrency()},
{"system_info", llama_print_system_info()},
});

View File

@@ -31,7 +31,6 @@ init_vars() {
NO_WHOLE_ARCHIVE=""
GCC_ARCH="-arch ${ARCH}"
DIST_BASE=../../dist/darwin-${GOARCH}/
PAYLOAD_BASE=../../build/darwin/${GOARCH}
;;
"Linux")
LIB_EXT="so"
@@ -41,7 +40,6 @@ init_vars() {
# Cross compiling not supported on linux - Use docker
GCC_ARCH=""
DIST_BASE=../../dist/linux-${GOARCH}/
PAYLOAD_BASE=../../build/linux/${GOARCH}
;;
*)
;;
@@ -49,8 +47,7 @@ init_vars() {
if [ -z "${CMAKE_CUDA_ARCHITECTURES}" ] ; then
CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
fi
GZIP=$(command -v pigz 2>/dev/null || echo "gzip")
RUNNER_BASE="${DIST_BASE}/lib/ollama/runners"
GZIP=$(which pigz 2>/dev/null || echo "gzip")
}
git_module_setup() {
@@ -69,47 +66,40 @@ git_module_setup() {
}
apply_patches() {
# apply temporary patches until fix is upstream
for patch in ../patches/*.patch; do
git -c 'user.name=nobody' -c 'user.email=<>' -C ${LLAMACPP_DIR} am ${patch}
done
# Wire up our CMakefile
if ! grep ollama ${LLAMACPP_DIR}/CMakeLists.txt; then
echo 'add_subdirectory(../ext_server ext_server) # ollama' >>${LLAMACPP_DIR}/CMakeLists.txt
fi
if [ -n "$(ls -A ../patches/*.diff)" ]; then
# apply temporary patches until fix is upstream
for patch in ../patches/*.diff; do
for file in $(grep "^+++ " ${patch} | cut -f2 -d' ' | cut -f2- -d/); do
(cd ${LLAMACPP_DIR}; git checkout ${file})
done
done
for patch in ../patches/*.diff; do
(cd ${LLAMACPP_DIR} && git apply ${patch})
done
fi
}
build() {
cmake -S ${LLAMACPP_DIR} -B ${BUILD_DIR} ${CMAKE_DEFS}
cmake --build ${BUILD_DIR} ${CMAKE_TARGETS} -j8
# remove unnecessary build artifacts
rm -f ${BUILD_DIR}/bin/ggml-common.h ${BUILD_DIR}/bin/ggml-metal.metal
}
dist() {
[ -z "${RUNNER}" ] && exit 1
mkdir -p ${RUNNER_BASE}/${RUNNER}/
for f in ${BUILD_DIR}/bin/* ; do
cp ${f} ${RUNNER_BASE}/${RUNNER}/
done
# check for lib directory
if [ -d ${BUILD_DIR}/lib ]; then
for f in ${BUILD_DIR}/lib/* ; do
cp ${f} ${RUNNER_BASE}/${RUNNER}/
done
fi
}
# Compress from the build $BUILD_DIR into the $PAYLOAD_BASE/$RUNNER dir
compress() {
[ -z "${RUNNER}" ] && exit 1
echo "Compressing payloads with ${GZIP} to reduce overall binary size..."
rm -rf "${PAYLOAD_BASE}/${RUNNER}/"
mkdir -p "${PAYLOAD_BASE}/${RUNNER}/"
echo "Compressing payloads to reduce overall binary size..."
rm -rf ${BUILD_DIR}/bin/*.gz
for f in ${BUILD_DIR}/bin/* ; do
${GZIP} -c --best ${f} > "${PAYLOAD_BASE}/${RUNNER}/$(basename ${f}).gz" &
${GZIP} -n --best -f ${f} &
compress_pids+=" $!"
done
# check for lib directory
if [ -d ${BUILD_DIR}/lib ]; then
for f in ${BUILD_DIR}/lib/* ; do
${GZIP} -c --best ${f} > "${PAYLOAD_BASE}/${RUNNER}/$(basename ${f}).gz" &
${GZIP} -n --best -f ${f} &
compress_pids+=" $!"
done
fi
@@ -125,7 +115,7 @@ wait_for_compress() {
install() {
echo "Installing libraries to bin dir ${BUILD_DIR}/bin/"
for lib in $(find ${BUILD_DIR} -name \*.${LIB_EXT} | grep -v "${BUILD_DIR}/bin/" ); do
for lib in $(find ${BUILD_DIR} -name \*.${LIB_EXT}); do
rm -f "${BUILD_DIR}/bin/$(basename ${lib})"
cp -af "${lib}" "${BUILD_DIR}/bin/"
done

View File

@@ -19,7 +19,7 @@ sign() {
fi
}
COMMON_DARWIN_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DGGML_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DGGML_METAL_EMBED_LIBRARY=on -DGGML_OPENMP=off"
COMMON_DARWIN_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DGGML_METAL_EMBED_LIBRARY=on -DGGML_OPENMP=off"
case "${GOARCH}" in
"amd64")
@@ -39,8 +39,7 @@ case "${GOARCH}" in
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=off -DGGML_BLAS=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
RUNNER=cpu
BUILD_DIR="../build/darwin/${GOARCH}/${RUNNER}"
BUILD_DIR="../build/darwin/${ARCH}/cpu"
echo "Building LCD CPU"
build
sign ${BUILD_DIR}/bin/ollama_llama_server
@@ -52,8 +51,7 @@ case "${GOARCH}" in
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=off -DGGML_BLAS=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
RUNNER=cpu_avx
BUILD_DIR="../build/darwin/${GOARCH}/${RUNNER}"
BUILD_DIR="../build/darwin/${ARCH}/cpu_avx"
echo "Building AVX CPU"
build
sign ${BUILD_DIR}/bin/ollama_llama_server
@@ -65,8 +63,7 @@ case "${GOARCH}" in
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=on -DGGML_BLAS=off -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on ${CMAKE_DEFS}"
RUNNER=cpu_avx2
BUILD_DIR="../build/darwin/${GOARCH}/${RUNNER}"
BUILD_DIR="../build/darwin/${ARCH}/cpu_avx2"
echo "Building AVX2 CPU"
EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation"
build
@@ -87,8 +84,7 @@ case "${GOARCH}" in
if [ -z "$OLLAMA_SKIP_METAL_GENERATE" ]; then
init_vars
CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} ${CMAKE_DEFS}"
RUNNER="metal"
BUILD_DIR="../build/darwin/${GOARCH}/${RUNNER}"
BUILD_DIR="../build/darwin/${ARCH}/metal"
EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders"
build
sign ${BUILD_DIR}/bin/ollama_llama_server

View File

@@ -79,12 +79,10 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
init_vars
echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\""
CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
RUNNER="cpu"
BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
BUILD_DIR="../build/linux/${ARCH}/cpu"
echo "Building custom CPU"
build
install
dist
compress
else
# Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512
@@ -104,12 +102,10 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
RUNNER=cpu
BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
BUILD_DIR="../build/linux/${ARCH}/cpu"
echo "Building LCD CPU"
build
install
dist
compress
fi
@@ -124,12 +120,10 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
RUNNER=cpu_avx
BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
BUILD_DIR="../build/linux/${ARCH}/cpu_avx"
echo "Building AVX CPU"
build
install
dist
compress
fi
@@ -140,12 +134,10 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on ${CMAKE_DEFS}"
RUNNER=cpu_avx2
BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
BUILD_DIR="../build/linux/${ARCH}/cpu_avx2"
echo "Building AVX2 CPU"
build
install
dist
compress
fi
fi
@@ -195,13 +187,11 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then
fi
export CUDAFLAGS="-t8"
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS} -DGGML_STATIC=off"
RUNNER=cuda${CUDA_VARIANT}
BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}"
export LLAMA_SERVER_LDFLAGS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
CUDA_DIST_DIR="${CUDA_DIST_DIR:-${DIST_BASE}/lib/ollama}"
build
install
dist
echo "Installing CUDA dependencies in ${CUDA_DIST_DIR}"
mkdir -p "${CUDA_DIST_DIR}"
for lib in ${CUDA_LIB_DIR}/libcudart.so* ${CUDA_LIB_DIR}/libcublas.so* ${CUDA_LIB_DIR}/libcublasLt.so* ; do
@@ -222,8 +212,7 @@ if [ -z "${OLLAMA_SKIP_ONEAPI_GENERATE}" -a -d "${ONEAPI_ROOT}" ]; then
source ${ONEAPI_ROOT}/setvars.sh --force # set up environment variables for oneAPI
CC=icx
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL=ON -DGGML_SYCL_F16=OFF"
RUNNER=oneapi
BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
BUILD_DIR="../build/linux/${ARCH}/oneapi"
ONEAPI_DIST_DIR="${DIST_BASE}/lib/ollama"
export LLAMA_SERVER_LDFLAGS="-fsycl -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb"
DEBUG_FLAGS="" # icx compiles with -O0 if we pass -g, so we must remove it
@@ -242,7 +231,6 @@ if [ -z "${OLLAMA_SKIP_ONEAPI_GENERATE}" -a -d "${ONEAPI_ROOT}" ]; then
cp "${ONEAPI_ROOT}/compiler/latest/lib/libsvml.so" "${ONEAPI_DIST_DIR}"
cp "${ONEAPI_ROOT}/compiler/latest/lib/libur_loader.so.0" "${ONEAPI_DIST_DIR}"
install
dist
compress
fi
@@ -271,8 +259,7 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then
CMAKE_DEFS="${CMAKE_DEFS} ${OLLAMA_CUSTOM_ROCM_DEFS}"
echo "Building custom ROCM GPU"
fi
RUNNER=rocm${ROCM_VARIANT}
BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
BUILD_DIR="../build/linux/${ARCH}/rocm${ROCM_VARIANT}"
# ROCm dependencies are too large to fit into a unified bundle
ROCM_DIST_DIR="${DIST_BASE}/../linux-${GOARCH}-rocm/lib/ollama"
# TODO figure out how to disable runpath (rpath)
@@ -282,17 +269,13 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then
# copy the ROCM dependencies
mkdir -p "${ROCM_DIST_DIR}"
for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -v "${GOARCH}/rocm${ROCM_VARIANT}" | grep -e rocm -e amdgpu -e libtinfo -e libnuma -e libelf ); do
for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -v "${ARCH}/rocm${ROCM_VARIANT}" | grep -e rocm -e amdgpu -e libtinfo ); do
cp -a "${dep}"* "${ROCM_DIST_DIR}"
if [ $(readlink -f "${dep}") != "${dep}" ] ; then
cp $(readlink -f "${dep}") "${ROCM_DIST_DIR}"
fi
done
install
dist
compress
fi
cleanup
wait_for_compress
echo "go generate completed. LLM runners: $(cd ${PAYLOAD_BASE}; echo *)"
echo "go generate completed. LLM runners: $(cd ${BUILD_DIR}/..; echo *)"

View File

@@ -19,19 +19,6 @@ function amdGPUs {
function init_vars {
write-host "Checking for cmake..."
get-command cmake
write-host "Checking for ninja..."
$d=(get-command -ea 'silentlycontinue' ninja).path
if ($null -eq $d) {
$MSVC_INSTALL=(Get-CimInstance MSFT_VSInstance -Namespace root/cimv2/vs)[0].InstallLocation
$matches=(gci -path $MSVC_INSTALL -r -fi ninja.exe)
if ($matches.count -eq 0) {
throw "Unable to locate ninja"
}
$ninjaDir=($matches[0].FullName | split-path -parent)
$env:PATH="$env:PATH;$ninjaDir"
}
if (!$script:SRC_DIR) {
$script:SRC_DIR = $(resolve-path "..\..\")
}
@@ -96,9 +83,29 @@ function git_module_setup {
}
function apply_patches {
# Wire up our CMakefile
if (!(Select-String -Path "${script:llamacppDir}/CMakeLists.txt" -Pattern 'ollama')) {
Add-Content -Path "${script:llamacppDir}/CMakeLists.txt" -Value 'add_subdirectory(../ext_server ext_server) # ollama'
}
# Apply temporary patches until fix is upstream
foreach ($patch in $(Get-ChildItem "../patches/*.patch")) {
git -c 'user.name=nobody' -c 'user.email=<>' -C "${script:llamacppDir}" am $patch.FullName
$patches = Get-ChildItem "../patches/*.diff"
foreach ($patch in $patches) {
# Extract file paths from the patch file
$filePaths = Get-Content $patch.FullName | Where-Object { $_ -match '^\+\+\+ ' } | ForEach-Object {
$parts = $_ -split ' '
($parts[1] -split '/', 2)[1]
}
# Checkout each file
foreach ($file in $filePaths) {
git -C "${script:llamacppDir}" checkout $file
}
}
# Apply each patch
foreach ($patch in $patches) {
git -C "${script:llamacppDir}" apply $patch.FullName
}
}
@@ -158,7 +165,7 @@ function cleanup {
}
# Checkout each file
foreach ($file in $filePaths) {
foreach ($file in $filePaths) {
git -C "${script:llamacppDir}" checkout $file
}
git -C "${script:llamacppDir}" checkout CMakeLists.txt
@@ -175,12 +182,12 @@ function build_static() {
if ((-not "${env:OLLAMA_SKIP_STATIC_GENERATE}") -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "static"))) {
# GCC build for direct linking into the Go binary
init_vars
# cmake will silently fallback to msvc compilers if gcc isn't in the path, so detect and fail fast
# as we need this to be compiled by gcc for golang to be able to link with it
write-host "Checking for gcc..."
get-command gcc
get-command mingw32-make
# cmake will silently fallback to msvc compilers if mingw isn't in the path, so detect and fail fast
# as we need this to be compiled by gcc for golang to be able to link with it
write-host "Checking for MinGW..."
# error action ensures we exit on failure
get-command gcc
get-command mingw32-make
$oldTargets = $script:cmakeTargets
$script:cmakeTargets = @("llama", "ggml")
$script:cmakeDefs = @(
@@ -204,10 +211,11 @@ function build_static() {
}
}
function build_cpu_x64 {
function build_cpu($gen_arch) {
if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu"))) {
# remaining llama.cpp builds use MSVC
init_vars
$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DGGML_AVX=off", "-DGGML_AVX2=off", "-DGGML_AVX512=off", "-DGGML_FMA=off", "-DGGML_F16C=off") + $script:cmakeDefs
$script:cmakeDefs = $script:commonCpuDefs + @("-A", $gen_arch, "-DGGML_AVX=off", "-DGGML_AVX2=off", "-DGGML_AVX512=off", "-DGGML_FMA=off", "-DGGML_F16C=off") + $script:cmakeDefs
$script:buildDir="../build/windows/${script:ARCH}/cpu"
$script:distDir="$script:DIST_BASE\cpu"
write-host "Building LCD CPU"
@@ -219,32 +227,6 @@ function build_cpu_x64 {
}
}
function build_cpu_arm64 {
if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu"))) {
init_vars
write-host "Checking for clang..."
get-command clang
$env:CFLAGS="-march=armv8.7-a -fvectorize -ffp-model=fast -fno-finite-math-only"
$env:CXXFLAGS="$env:CFLAGS"
$env:LDFLAGS="-static-libstdc++"
$script:cmakeDefs = $script:commonCpuDefs + @(
"-DCMAKE_VERBOSE_MAKEFILE=on",
"-DCMAKE_C_COMPILER=clang.exe",
"-DCMAKE_CXX_COMPILER=clang++.exe",
"-DMSVC_RUNTIME_LIBRARY=MultiThreaded"
) + $script:cmakeDefs
$script:buildDir="../build/windows/${script:ARCH}/cpu"
$script:distDir="$script:DIST_BASE\cpu"
write-host "Building LCD CPU"
build
sign
install
} else {
write-host "Skipping CPU generation step as requested"
}
}
function build_cpu_avx() {
if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu_avx"))) {
init_vars
@@ -369,7 +351,7 @@ function build_rocm() {
$script:buildDir="../build/windows/${script:ARCH}/rocm$script:ROCM_VARIANT"
$script:distDir="$script:DIST_BASE\rocm$script:ROCM_VARIANT"
$script:cmakeDefs += @(
"-G", "Ninja",
"-G", "Ninja",
"-DCMAKE_C_COMPILER=clang.exe",
"-DCMAKE_CXX_COMPILER=clang++.exe",
"-DGGML_HIPBLAS=on",
@@ -418,9 +400,9 @@ if ($($args.count) -eq 0) {
apply_patches
build_static
if ($script:ARCH -eq "arm64") {
build_cpu_arm64
build_cpu("ARM64")
} else { # amd64
build_cpu_x64
build_cpu("x64")
build_cpu_avx
build_cpu_avx2
build_cuda
@@ -434,5 +416,5 @@ if ($($args.count) -eq 0) {
for ( $i = 0; $i -lt $args.count; $i++ ) {
write-host "performing $($args[$i])"
& $($args[$i])
}
}
}

View File

@@ -360,13 +360,11 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
switch llm.KV().Architecture() {
case "llama":
fullOffload = max(
4*batch*(1+4*embedding+context*(1+heads)),
4*batch*(embedding+vocab),
)
fullOffload = 4 * batch * (1 + 4*embedding + context*(1+heads))
partialOffload = 4 * batch * embedding
partialOffload += max(
// 4*batch*(4+6*embedding+context*(2*heads)+llm.KV().GQA()),
4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embeddingHeads*headsKV),
4*batch*(embedding+vocab)+embedding*vocab*105/128,
)

View File

@@ -5,7 +5,7 @@ package llm
// #cgo darwin,arm64 LDFLAGS: -L${SRCDIR}/build/darwin/arm64_static -L${SRCDIR}/build/darwin/arm64_static/src -L${SRCDIR}/build/darwin/arm64_static/ggml/src -framework Accelerate -framework Metal
// #cgo darwin,amd64 LDFLAGS: -L${SRCDIR}/build/darwin/x86_64_static -L${SRCDIR}/build/darwin/x86_64_static/src -L${SRCDIR}/build/darwin/x86_64_static/ggml/src
// #cgo windows,amd64 LDFLAGS: -static-libstdc++ -static-libgcc -static -L${SRCDIR}/build/windows/amd64_static -L${SRCDIR}/build/windows/amd64_static/src -L${SRCDIR}/build/windows/amd64_static/ggml/src
// #cgo windows,arm64 LDFLAGS: -lllama -lggml -static-libstdc++ -static-libgcc -static -L${SRCDIR}/build/windows/arm64_static -L${SRCDIR}/build/windows/arm64_static/src -L${SRCDIR}/build/windows/arm64_static/ggml/src
// #cgo windows,arm64 LDFLAGS: -static-libstdc++ -static-libgcc -static -L${SRCDIR}/build/windows/arm64_static -L${SRCDIR}/build/windows/arm64_static/src -L${SRCDIR}/build/windows/arm64_static/ggml/src
// #cgo linux,amd64 LDFLAGS: -L${SRCDIR}/build/linux/x86_64_static -L${SRCDIR}/build/linux/x86_64_static/src -L${SRCDIR}/build/linux/x86_64_static/ggml/src
// #cgo linux,arm64 LDFLAGS: -L${SRCDIR}/build/linux/arm64_static -L${SRCDIR}/build/linux/arm64_static/src -L${SRCDIR}/build/linux/arm64_static/ggml/src
// #include <stdlib.h>

11
llm/llm_darwin_amd64.go Normal file
View File

@@ -0,0 +1,11 @@
package llm
import (
"embed"
"syscall"
)
//go:embed build/darwin/x86_64/*/bin/*
var libEmbed embed.FS
var LlamaServerSysProcAttr = &syscall.SysProcAttr{}

View File

@@ -1,7 +1,11 @@
package llm
import (
"embed"
"syscall"
)
//go:embed build/darwin/arm64/*/bin/*
var libEmbed embed.FS
var LlamaServerSysProcAttr = &syscall.SysProcAttr{}

View File

@@ -1,7 +1,11 @@
package llm
import (
"embed"
"syscall"
)
//go:embed build/linux/*/*/bin/*
var libEmbed embed.FS
var LlamaServerSysProcAttr = &syscall.SysProcAttr{}

View File

@@ -1,13 +1,14 @@
package llm
import (
"embed"
"syscall"
)
const (
CREATE_DEFAULT_ERROR_MODE = 0x04000000
ABOVE_NORMAL_PRIORITY_CLASS = 0x00008000
)
// unused on windows
var libEmbed embed.FS
const CREATE_DEFAULT_ERROR_MODE = 0x04000000
var LlamaServerSysProcAttr = &syscall.SysProcAttr{
// Wire up the default error handling logic. If for some reason a DLL is
@@ -15,8 +16,5 @@ var LlamaServerSysProcAttr = &syscall.SysProcAttr{
// the user can either fix their PATH, or report a bug. Without this
// setting, the process exits immediately with a generic exit status but no
// way to (easily) figure out what the actual missing DLL was.
//
// Setting Above Normal priority class ensures when running as a "background service"
// with "programs" given best priority, we aren't starved of cpu cycles
CreationFlags: CREATE_DEFAULT_ERROR_MODE | ABOVE_NORMAL_PRIORITY_CLASS,
CreationFlags: CREATE_DEFAULT_ERROR_MODE,
}

View File

@@ -7,7 +7,6 @@ import (
"strings"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/gpu"
)
@@ -95,7 +94,6 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
// Overflow that didn't fit into the GPU
var overflow uint64
overhead := envconfig.GpuOverhead()
availableList := make([]string, len(gpus))
for i, gpu := range gpus {
availableList[i] = format.HumanBytes2(gpu.FreeMemory)
@@ -166,22 +164,8 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
gzo = gpuZeroOverhead
}
// Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least one more layer
if (gpus[i].FreeMemory - overhead) < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize {
slog.Debug("gpu has too little memory to allocate any layers",
"id", gpus[i].ID,
"library", gpus[i].Library,
"variant", gpus[i].Variant,
"compute", gpus[i].Compute,
"driver", fmt.Sprintf("%d.%d", gpus[i].DriverMajor, gpus[i].DriverMinor),
"name", gpus[i].Name,
"total", format.HumanBytes2(gpus[i].TotalMemory),
"available", format.HumanBytes2(gpus[i].FreeMemory),
"minimum_memory", gpus[i].MinimumMemory,
"layer_size", format.HumanBytes2(layerSize),
"gpu_zer_overhead", format.HumanBytes2(gzo),
"partial_offload", format.HumanBytes2(graphPartialOffload),
"full_offload", format.HumanBytes2(graphFullOffload),
)
if gpus[i].FreeMemory < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize {
slog.Debug("gpu has too little memory to allocate any layers", "gpu", gpus[i])
continue
}
gpusWithSpace = append(gpusWithSpace, gs{i, &gpus[i]})
@@ -212,7 +196,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
for j := len(gpusWithSpace); j > 0; j-- {
g := gpusWithSpace[i%j]
used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
if (g.g.FreeMemory - overhead) > used+layerSize {
if g.g.FreeMemory > used+layerSize {
gpuAllocations[g.i] += layerSize
layerCounts[g.i]++
layerCount++
@@ -235,7 +219,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
for j := len(gpusWithSpace); j > 0; j-- {
g := gpusWithSpace[layerCount%j]
used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
if (g.g.FreeMemory - overhead) > used+memoryLayerOutput {
if g.g.FreeMemory > used+memoryLayerOutput {
gpuAllocations[g.i] += memoryLayerOutput
layerCounts[g.i]++
layerCount++
@@ -322,7 +306,6 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
}
func (m MemoryEstimate) log() {
overhead := envconfig.GpuOverhead()
slog.Info(
"offload to "+m.inferenceLibrary,
slog.Group(
@@ -340,7 +323,6 @@ func (m MemoryEstimate) log() {
"memory",
// memory available by GPU for offloading
"available", m.availableList,
"gpu_overhead", format.HumanBytes2(overhead),
slog.Group(
"required",
// memory required for full offloading

View File

@@ -1,22 +0,0 @@
From 8b8d83ffca775840acc5dc700f3b3703e9f5cfe4 Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Fri, 23 Aug 2024 11:27:48 -0700
Subject: [PATCH] patch cmakelist
---
CMakeLists.txt | 2 ++
1 file changed, 2 insertions(+)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a3132063..6a2a9912 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -199,3 +199,5 @@ if (LLAMA_BUILD_EXAMPLES)
add_subdirectory(examples)
add_subdirectory(pocs)
endif()
+
+add_subdirectory(../ext_server ext_server) # ollama
--
2.45.2

View File

@@ -1,402 +0,0 @@
From 8313ce5f43f11f3d84f352f97f3802792e90e18c Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Mon, 16 Sep 2024 15:53:16 -0700
Subject: [PATCH] add solar-pro support
solar-pro introduces block skip connections where blocks are connected
to other, non-sequential blocks with a scale multiple
this change adds 4 new keys to store the skip connections and one new
tensor to store the scalar. the scalar is implemented as a 1-dimensional
tensor with 2 elements derived from the model's bskcn_tv configuration.
in general, the values are (bskcn_tv, 1 - bskcn_tv)
---
src/llama.cpp | 267 +++++++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 254 insertions(+), 13 deletions(-)
diff --git a/src/llama.cpp b/src/llama.cpp
index f79bd782..b7771f53 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -213,6 +213,7 @@ enum llm_arch {
LLM_ARCH_NEMOTRON,
LLM_ARCH_EXAONE,
LLM_ARCH_RWKV6,
+ LLM_ARCH_SOLAR,
LLM_ARCH_UNKNOWN,
};
@@ -261,6 +262,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_NEMOTRON, "nemotron" },
{ LLM_ARCH_EXAONE, "exaone" },
{ LLM_ARCH_RWKV6, "rwkv6" },
+ { LLM_ARCH_SOLAR, "solar" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};
@@ -314,6 +316,7 @@ enum llm_kv {
LLM_KV_ATTENTION_KV_LORA_RANK,
LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
LLM_KV_ATTENTION_SLIDING_WINDOW,
+ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_FREQ_BASE,
@@ -405,19 +408,20 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_TIME_MIX_EXTRA_DIM, "%s.time_mix_extra_dim" },
{ LLM_KV_TIME_DECAY_EXTRA_DIM, "%s.time_decay_extra_dim" },
- { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
- { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
- { LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" },
- { LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" },
- { LLM_KV_ATTENTION_KEY_LENGTH, "%s.attention.key_length" },
- { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
- { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
- { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
- { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
- { LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" },
- { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
- { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
- { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
+ { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
+ { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
+ { LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" },
+ { LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" },
+ { LLM_KV_ATTENTION_KEY_LENGTH, "%s.attention.key_length" },
+ { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
+ { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
+ { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
+ { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
+ { LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" },
+ { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
+ { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
+ { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
+ { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection.%d" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
@@ -589,6 +593,7 @@ enum llm_tensor {
LLM_TENSOR_ENC_FFN_DOWN,
LLM_TENSOR_ENC_FFN_UP,
LLM_TENSOR_ENC_OUTPUT_NORM,
+ LLM_TENSOR_BSKCN_TV,
};
static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
@@ -1408,6 +1413,24 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
{ LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "blk.%d.channel_mix_receptance" },
},
},
+ {
+ LLM_ARCH_SOLAR,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_BSKCN_TV, "bskcn_tv" },
+ },
+ },
{
LLM_ARCH_UNKNOWN,
{
@@ -2237,6 +2260,7 @@ enum e_model {
MODEL_15B,
MODEL_16B,
MODEL_20B,
+ MODEL_22B,
MODEL_30B,
MODEL_34B,
MODEL_35B,
@@ -2284,6 +2308,8 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
+ std::array<std::array<uint32_t, LLAMA_MAX_LAYERS>, 4> n_bskcn_arr;
+
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
uint32_t n_lora_kv = 0;
@@ -2349,6 +2375,7 @@ struct llama_hparams {
if (this->n_head_arr != other.n_head_arr) return true;
if (this->n_head_kv_arr != other.n_head_kv_arr) return true;
if (this->n_ff_arr != other.n_ff_arr) return true;
+ if (this->n_bskcn_arr != other.n_bskcn_arr) return true;
if (this->n_rel_attn_bkts != other.n_rel_attn_bkts) return true;
if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
@@ -2455,6 +2482,14 @@ struct llama_hparams {
return ssm_d_state * ssm_d_inner;
}
}
+
+ bool n_bskcn(uint32_t n, uint32_t il = 0) const {
+ if (il < n_layer) {
+ return n_bskcn_arr[n][il] > 0;
+ }
+
+ GGML_ABORT("fatal error");
+ }
};
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
@@ -2635,6 +2670,8 @@ struct llama_layer {
struct ggml_tensor * ffn_gate_scale;
struct ggml_tensor * ffn_up_scale;
struct ggml_tensor * ffn_down_scale;
+
+ struct ggml_tensor * bskcn_tv;
};
// very similar to llama_batch,
@@ -5937,6 +5974,21 @@ static void llm_load_hparams(
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
+ case LLM_ARCH_SOLAR:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ for (int i = 0; i < hparams.n_bskcn_arr.max_size(); ++i) {
+ auto & bskcn = hparams.n_bskcn_arr.at(i);
+ bskcn.fill(0);
+ ml.get_key_or_arr(::format(LLM_KV_NAMES.at(LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION), LLM_ARCH_NAMES.at(ml.llm_kv.arch), i), bskcn, hparams.n_layer, false);
+ }
+
+ switch (hparams.n_layer) {
+ case 64: model.type = e_model::MODEL_22B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ }
default: (void)0;
}
@@ -8420,6 +8472,38 @@ static bool llm_load_tensors(
}
} break;
+ case LLM_ARCH_SOLAR:
+ {
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+ // output
+ {
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head});
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
+
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+
+ layer.bskcn_tv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_BSKCN_TV, "weight"), {2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+ }
+ } break;
default:
throw std::runtime_error("unknown architecture");
}
@@ -15173,6 +15257,158 @@ struct llm_build_context {
return gf;
}
+
+ ggml_cgraph * build_solar() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
+ int32_t n_tokens = this->n_tokens;
+
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;
+
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+ // inp_pos - contains the positions
+ struct ggml_tensor * inp_pos = build_inp_pos();
+
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+ struct ggml_tensor * bskcn_1;
+ struct ggml_tensor * bskcn_2;
+
+ for (int il = 0; il < n_layer; ++il) {
+ struct ggml_tensor * inpSA = inpL;
+
+ if (hparams.n_bskcn(0, il)) {
+ bskcn_1 = inpSA;
+ }
+
+ if (hparams.n_bskcn(1, il)) {
+ bskcn_2 = inpSA;
+ }
+
+ if (hparams.n_bskcn(2, il)) {
+ inpSA = ggml_add(
+ ctx0,
+ ggml_mul(ctx0, bskcn_1, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, 0)),
+ ggml_mul(ctx0, inpSA, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, ggml_element_size(model.layers[il].bskcn_tv))));
+ }
+
+ if (hparams.n_bskcn(3, il)) {
+ inpSA = ggml_add(
+ ctx0,
+ ggml_mul(ctx0, bskcn_2, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, 0)),
+ ggml_mul(ctx0, inpSA, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, ggml_element_size(model.layers[il].bskcn_tv))));
+ }
+
+ // norm
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
+ struct ggml_tensor * rope_factors = build_rope_factors(il);
+
+ // compute Q and K and RoPE them
+ struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+
+ struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+
+ struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+
+ Qcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Qcur, "Qcur", il);
+
+ Kcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Kcur, "Kcur", il);
+
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+ model.layers[il].wo, model.layers[il].bo,
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ }
+
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+ n_tokens = n_outputs;
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = llm_build_ffn(ctx0, lctx, cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+ cb(cur, "ffn_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il);
+
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ // lm_head
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
};
static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -15423,6 +15659,10 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_rwkv6();
} break;
+ case LLM_ARCH_SOLAR:
+ {
+ result = llm.build_solar();
+ } break;
default:
GGML_ABORT("fatal error");
}
@@ -18503,6 +18743,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_ARCTIC:
case LLM_ARCH_DEEPSEEK2:
case LLM_ARCH_CHATGLM:
+ case LLM_ARCH_SOLAR:
return LLAMA_ROPE_TYPE_NORM;
// the pairs of head values are offset by n_rot/2
--
2.46.0

View File

@@ -1,693 +0,0 @@
From 52f526a86b6fdd50784678c02d8212edc2412a5b Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Tue, 24 Sep 2024 11:53:40 -0700
Subject: [PATCH] add mllama support
mllama adds cross-attention layers to the standard llama architecture
it also requires a way to input a new tensor: cross_attention_state
once per generation
cross-attention layers don't change and so they are cached in the
kv cache once per run
remaining is to implement the cross attention mask
---
include/llama.h | 4 +
src/llama.cpp | 456 ++++++++++++++++++++++++++++++++++++++++++++++--
2 files changed, 447 insertions(+), 13 deletions(-)
diff --git a/include/llama.h b/include/llama.h
index bfc37e88..792520cc 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -449,6 +449,10 @@ extern "C" {
struct llama_model * model,
struct llama_context_params params);
+ // TODO (jmorganca): this should most likely be passed in as part of a batch
+ // and not set on the context for all batches.
+ LLAMA_API void llama_set_cross_attn_state(struct llama_context * ctx, float * cross_attn_state);
+
// Frees all allocated memory
LLAMA_API void llama_free(struct llama_context * ctx);
diff --git a/src/llama.cpp b/src/llama.cpp
index b7771f53..cf70ea90 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -170,6 +170,7 @@ static std::string format(const char * fmt, ...) {
enum llm_arch {
LLM_ARCH_LLAMA,
+ LLM_ARCH_MLLAMA,
LLM_ARCH_FALCON,
LLM_ARCH_BAICHUAN,
LLM_ARCH_GROK,
@@ -219,6 +220,7 @@ enum llm_arch {
static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_LLAMA, "llama" },
+ { LLM_ARCH_MLLAMA, "mllama" },
{ LLM_ARCH_FALCON, "falcon" },
{ LLM_ARCH_GROK, "grok" },
{ LLM_ARCH_GPT2, "gpt2" },
@@ -317,6 +319,7 @@ enum llm_kv {
LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
+ LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS,
LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_FREQ_BASE,
@@ -422,6 +425,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection.%d" },
+ { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
@@ -594,6 +598,14 @@ enum llm_tensor {
LLM_TENSOR_ENC_FFN_UP,
LLM_TENSOR_ENC_OUTPUT_NORM,
LLM_TENSOR_BSKCN_TV,
+ LLM_TENSOR_CROSS_ATTN_K_NORM,
+ LLM_TENSOR_CROSS_ATTN_K_PROJ,
+ LLM_TENSOR_CROSS_ATTN_O_PROJ,
+ LLM_TENSOR_CROSS_ATTN_Q_NORM,
+ LLM_TENSOR_CROSS_ATTN_Q_PROJ,
+ LLM_TENSOR_CROSS_ATTN_V_PROJ,
+ LLM_TENSOR_CROSS_ATTN_ATTN_GATE,
+ LLM_TENSOR_CROSS_ATTN_MLP_GATE,
};
static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
@@ -623,6 +635,40 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
},
},
+ {
+ LLM_ARCH_MLLAMA,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
+ { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
+ { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+ { LLM_TENSOR_CROSS_ATTN_K_NORM, "blk.%d.cross_attn_k_norm" },
+ { LLM_TENSOR_CROSS_ATTN_K_PROJ, "blk.%d.cross_attn_k_proj" },
+ { LLM_TENSOR_CROSS_ATTN_O_PROJ, "blk.%d.cross_attn_o_proj" },
+ { LLM_TENSOR_CROSS_ATTN_Q_NORM, "blk.%d.cross_attn_q_norm" },
+ { LLM_TENSOR_CROSS_ATTN_Q_PROJ, "blk.%d.cross_attn_q_proj" },
+ { LLM_TENSOR_CROSS_ATTN_V_PROJ, "blk.%d.cross_attn_v_proj" },
+ { LLM_TENSOR_CROSS_ATTN_ATTN_GATE, "blk.%d.cross_attn_attn_gate" },
+ { LLM_TENSOR_CROSS_ATTN_MLP_GATE, "blk.%d.cross_attn_mlp_gate" },
+ },
+ },
{
LLM_ARCH_BAICHUAN,
{
@@ -2267,6 +2313,7 @@ enum e_model {
MODEL_40B,
MODEL_65B,
MODEL_70B,
+ MODEL_90B,
MODEL_236B,
MODEL_314B,
MODEL_SMALL,
@@ -2309,6 +2356,7 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
std::array<std::array<uint32_t, LLAMA_MAX_LAYERS>, 4> n_bskcn_arr;
+ std::array<uint32_t, LLAMA_MAX_LAYERS> cross_attn_layers;
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
@@ -2372,10 +2420,11 @@ struct llama_hparams {
if (this->n_expert != other.n_expert) return true;
if (this->n_expert_used != other.n_expert_used) return true;
- if (this->n_head_arr != other.n_head_arr) return true;
- if (this->n_head_kv_arr != other.n_head_kv_arr) return true;
- if (this->n_ff_arr != other.n_ff_arr) return true;
- if (this->n_bskcn_arr != other.n_bskcn_arr) return true;
+ if (this->n_head_arr != other.n_head_arr) return true;
+ if (this->n_head_kv_arr != other.n_head_kv_arr) return true;
+ if (this->n_ff_arr != other.n_ff_arr) return true;
+ if (this->n_bskcn_arr != other.n_bskcn_arr) return true;
+ if (this->cross_attn_layers != other.cross_attn_layers) return true;
if (this->n_rel_attn_bkts != other.n_rel_attn_bkts) return true;
if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
@@ -2490,6 +2539,10 @@ struct llama_hparams {
GGML_ABORT("fatal error");
}
+
+ bool cross_attention_layer(uint32_t il) const { // true when layer index il is listed in cross_attn_layers
+ return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end(); // linear scan over the whole array
+ }
};
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
@@ -2672,6 +2725,16 @@ struct llama_layer {
struct ggml_tensor * ffn_down_scale;
struct ggml_tensor * bskcn_tv;
+
+ // cross attention
+ struct ggml_tensor * cross_attn_k_norm;
+ struct ggml_tensor * cross_attn_k_proj;
+ struct ggml_tensor * cross_attn_o_proj;
+ struct ggml_tensor * cross_attn_q_norm;
+ struct ggml_tensor * cross_attn_q_proj;
+ struct ggml_tensor * cross_attn_v_proj;
+ struct ggml_tensor * cross_attn_attn_gate;
+ struct ggml_tensor * cross_attn_mlp_gate;
};
// very similar to llama_batch,
@@ -3317,6 +3380,12 @@ struct llama_context {
struct ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch]
struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc]
struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch]
+
+ // TODO (jmorganca): this should most likely be passed in as part of a batch
+ // and not set on the context for all batches.
+ float * cross_attn_state = nullptr; // pointer assigned by llama_set_cross_attn_state; uploaded into inp_cross_attn_state by llama_set_inputs
+ bool cross_attn_state_first_pass = true; // when true, build_mllama computes cross-attention K/V from the input state; otherwise it views the cached K/V
+ struct ggml_tensor * inp_cross_attn_state; // F32 [n_embd, 1601, 4]
};
struct llama_lora_weight {
@@ -3543,6 +3612,18 @@ static bool llama_kv_cache_init(
cache.v_l.reserve(n_layer);
for (int i = 0; i < (int) n_layer; i++) {
+ // for cross attention layers: allocate fixed-size F32 K/V tensors (6404 == 4 * 1601, the two trailing dims of inp_cross_attn_state)
+ if (model.arch == LLM_ARCH_MLLAMA && hparams.cross_attention_layer(i)) {
+ struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
+ ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_k, 6404, hparams.n_head_kv(i));
+ ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_v, 6404, hparams.n_head_kv(i));
+ ggml_format_name(k, "cache_k_l%d", i);
+ ggml_format_name(v, "cache_v_l%d", i);
+ cache.k_l.push_back(k);
+ cache.v_l.push_back(v);
+ continue; // skip the rest of the loop body for this layer
+ }
+
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
@@ -5312,12 +5393,14 @@ static void llm_load_hparams(
}
// zero-out the per-layer hparams
- std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
- std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
- std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
+ std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
+ std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
+ std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
+ std::fill(hparams.cross_attn_layers.begin(), hparams.cross_attn_layers.end(), -1);
- ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer);
- ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer);
+ ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer);
+ ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer);
+ ml.get_arr(LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, hparams.cross_attn_layers, false);
// n_head_kv is optional, default to n_head
hparams.n_head_kv_arr = hparams.n_head_arr;
@@ -5366,7 +5449,7 @@ static void llm_load_hparams(
ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
- if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) {
+ if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_MLLAMA || model.arch == LLM_ARCH_FALCON) {
if (hparams.n_rot != hparams.n_embd_head_k) {
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
}
@@ -5404,6 +5487,16 @@ static void llm_load_hparams(
}
}
} break;
+ case LLM_ARCH_MLLAMA:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ switch (hparams.n_layer) {
+ case 40: model.type = e_model::MODEL_11B; break;
+ case 100: model.type = e_model::MODEL_90B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
case LLM_ARCH_MINICPM:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -6918,6 +7011,55 @@ static bool llm_load_tensors(
}
}
} break;
+ case LLM_ARCH_MLLAMA:
+ {
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab+8}); // token embedding is requested with 8 rows more than n_vocab
+
+ // output
+ {
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
+ // if output is NULL, init from the input tok embed
+ if (model.output == NULL) {
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // NOTE(review): same tensor is requested as {n_embd, n_vocab+8} for tok_embd; confirm this {n_embd, n_vocab} shape is intended
+ }
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i];
+
+ if (hparams.cross_attention_layer(i)) {
+ layer.cross_attn_k_norm = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_K_NORM, "weight", i), {128});
+ layer.cross_attn_k_proj = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_K_PROJ, "weight", i), {n_embd, 1024});
+ layer.cross_attn_o_proj = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_O_PROJ, "weight", i), {n_embd, n_embd});
+ layer.cross_attn_q_norm = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_Q_NORM, "weight", i), {128});
+ layer.cross_attn_q_proj = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_Q_PROJ, "weight", i), {n_embd, n_embd});
+ layer.cross_attn_v_proj = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_V_PROJ, "weight", i), {n_embd, 1024});
+ layer.cross_attn_attn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_ATTN_GATE, i), {1});
+ layer.cross_attn_mlp_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_MLP_GATE, i), {1});
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+ } else {
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head});
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+ layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+ }
+ }
+ } break;
case LLM_ARCH_GROK:
{
if (n_expert == 0) {
@@ -8678,7 +8820,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE &&
model.hparams.n_vocab != model.vocab.id_to_token.size()) {
- throw std::runtime_error("vocab size mismatch");
+ LLAMA_LOG_WARN("%s: vocab mismatch %u != %zu ...\n", __func__, model.hparams.n_vocab, model.vocab.id_to_token.size()); // warn (rather than throw) when hparams.n_vocab differs from the loaded vocab size
}
if (params.vocab_only) {
@@ -8759,7 +8901,7 @@ static struct ggml_tensor * llm_build_inp_embd(
inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);
} else {
- lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
+ lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
inpL = lctx.inp_embd;
ggml_set_input(lctx.inp_embd);
}
@@ -8769,6 +8911,22 @@ static struct ggml_tensor * llm_build_inp_embd(
return inpL;
}
+static struct ggml_tensor * llm_build_inp_cross_attn_state( // creates the cross-attention state input tensor, records it on lctx and returns it
+ struct ggml_context * ctx,
+ struct llama_context & lctx,
+ const llama_hparams & hparams,
+ const llm_build_cb & cb) {
+ const int64_t n_embd = hparams.n_embd;
+
+ struct ggml_tensor * inpCAS;
+ lctx.inp_cross_attn_state = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, 1601, 4); // F32 tensor with hard-coded dims [n_embd, 1601, 4]
+ cb(lctx.inp_cross_attn_state, "inp_cross_attn_state", -1); // layer index -1: not tied to a specific layer
+ ggml_set_input(lctx.inp_cross_attn_state); // flag as a graph input; its data is uploaded in llama_set_inputs
+ inpCAS = lctx.inp_cross_attn_state;
+
+ return inpCAS;
+}
+
static void llm_build_kv_store(
struct ggml_context * ctx,
const llama_hparams & hparams,
@@ -9743,6 +9901,7 @@ struct llm_build_context {
lctx.inp_pos_bucket = nullptr;
lctx.inp_embd_enc = nullptr;
lctx.inp_KQ_mask_cross = nullptr;
+ lctx.inp_cross_attn_state = nullptr;
}
void free() {
@@ -10158,6 +10317,253 @@ struct llm_build_context {
LLM_NORM_RMS, cb, -1);
cb(cur, "result_norm", -1);
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
+
+ struct ggml_cgraph * build_mllama() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
+ int32_t n_tokens = this->n_tokens;
+
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;
+ struct ggml_tensor * inpCAS;
+
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+ inpCAS = llm_build_inp_cross_attn_state(ctx0, lctx, hparams, cb);
+
+ // inp_pos - contains the positions
+ struct ggml_tensor * inp_pos = build_inp_pos();
+
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+ for (int il = 0; il < n_layer; ++il) {
+ struct ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "attn_norm", il);
+
+ if (hparams.cross_attention_layer(il)) {
+ if (!lctx.cross_attn_state) {
+ continue;
+ }
+
+ // cross attention layer
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_q_proj, cur);
+ cb(Qcur, "Qcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ cb(Qcur, "Qcur", il);
+
+ Qcur = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+ cb(Qcur, "Qcur", il);
+
+ // TODO: is this required?
+ Qcur = ggml_cont(ctx0, Qcur);
+ cb(Qcur, "Qcur", il);
+
+ Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].cross_attn_q_norm, NULL, LLM_NORM_RMS, cb, il);
+ cb(Qcur, "Qcur", il);
+
+ struct ggml_tensor * Kcur;
+ if (lctx.cross_attn_state_first_pass) {
+ Kcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_k_proj, inpCAS);
+ cb(Kcur, "Kcur", il);
+
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, 6404);
+ cb(Kcur, "Kcur", il);
+
+ Kcur = ggml_permute(ctx0, Kcur, 0, 2, 1, 3);
+ cb(Kcur, "Kcur", il);
+
+ // TODO: is this required?
+ Kcur = ggml_cont(ctx0, Kcur);
+ cb(Kcur, "Kcur", il);
+
+ Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].cross_attn_k_norm, NULL, LLM_NORM_RMS, cb, il);
+ cb(Kcur, "Kcur", il);
+
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, kv_self.k_l[il]));
+ } else {
+ Kcur = ggml_view_tensor(ctx0, kv_self.k_l[il]);
+ cb(Kcur, "Kcur (view)", il);
+ }
+
+ struct ggml_tensor * Vcur;
+ if (lctx.cross_attn_state_first_pass) {
+ Vcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_v_proj, inpCAS);
+ cb(Vcur, "Vcur", il);
+
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, 6404);
+ cb(Vcur, "Vcur", il);
+
+ Vcur = ggml_permute(ctx0, Vcur, 0, 2, 1, 3);
+ cb(Vcur, "Vcur", il);
+
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, kv_self.v_l[il]));
+ } else {
+ Vcur = ggml_view_tensor(ctx0, kv_self.v_l[il]);
+ cb(Vcur, "Vcur (view)", il);
+ }
+
+ struct ggml_tensor * kq = ggml_mul_mat(ctx0, Kcur, Qcur);
+ cb(kq, "kq", il);
+
+ kq = ggml_scale_inplace(ctx0, kq, 1.0f/sqrtf(float(n_embd_head)));
+ cb(kq, "kq_scaled", il);
+
+ // TODO: apply causal masks
+ struct ggml_tensor * kq_soft_max = ggml_soft_max_inplace(ctx0, kq);
+ cb(kq_soft_max, "kq_soft_max", il);
+
+ Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, Vcur));
+ cb(Vcur, "Vcur", il);
+
+ struct ggml_tensor * kqv = ggml_mul_mat(ctx0, Vcur, kq_soft_max);
+ cb(kqv, "kqv", il);
+
+ struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
+ cb(kqv_merged, "kqv_merged", il);
+
+ cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_head_v*n_head, n_tokens);
+ cb(cur, "kqv_merged_cont", il);
+
+ cur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_o_proj, cur);
+ cb(cur, "cur", il);
+
+ // TODO: do this in place once?
+ cur = ggml_mul(ctx0, cur, ggml_tanh(ctx0, model.layers[il].cross_attn_attn_gate));
+
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = llm_build_ffn(ctx0, lctx, cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+ cb(cur, "ffn_out", il);
+
+ // TODO: do this inplace once?
+ cur = ggml_add_inplace(ctx0, ggml_mul_inplace(ctx0, cur, ggml_tanh(ctx0, model.layers[il].cross_attn_mlp_gate)), ffn_inp);
+ cb(cur, "ffn_out", il);
+
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ } else {
+ // self attention layer
+
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
+ struct ggml_tensor * rope_factors = build_rope_factors(il);
+
+ // compute Q and K and RoPE them
+ struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+
+ struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+
+ struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+
+ Qcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Qcur, "Qcur", il);
+
+ Kcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Kcur, "Kcur", il);
+
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+ model.layers[il].wo, model.layers[il].bo,
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+
+
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+ n_tokens = n_outputs;
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = llm_build_ffn(ctx0, lctx, cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+ cb(cur, "ffn_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il);
+
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+ }
+
+ cur = inpL;
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "result_norm", -1);
+
// lm_head
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
cb(cur, "result_output", -1);
@@ -15493,6 +15899,10 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_llama();
} break;
+ case LLM_ARCH_MLLAMA:
+ {
+ result = llm.build_mllama();
+ } break;
case LLM_ARCH_BAICHUAN:
{
result = llm.build_baichuan();
@@ -15753,6 +16163,14 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
}
+ // TODO (jmorganca): this might copy a lot of data on every request of a
+ // single generation even though it doesn't change, so we should
+ // find a way to not set this more than one time per image
+ if (lctx.inp_cross_attn_state &&
+ lctx.inp_cross_attn_state->buffer) {
+ ggml_backend_tensor_set(lctx.inp_cross_attn_state, lctx.cross_attn_state, 0, hparams.n_embd * 1601 * 4 * ggml_element_size(lctx.inp_cross_attn_state));
+ }
+
if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs");
const int64_t n_tokens = batch.n_tokens;
@@ -16430,6 +16848,10 @@ static int llama_decode_internal(
llama_set_inputs(lctx, ubatch);
+ // TODO: replace with something better to find out if it's
+ // our first actual pass
+ lctx.cross_attn_state_first_pass = false;
+
llama_graph_compute(lctx, gf, n_threads, threadpool);
// update the kv ring buffer
@@ -17586,7 +18008,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
if (llama_model_has_encoder(&model)) {
n_attn_layer *= 3;
}
- GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
+ if (qs.n_attention_wv != n_attn_layer) {
+ LLAMA_LOG_WARN("%s: n_attention_wv is unexpected, expected: %d, found: %d\n", __func__, n_attn_layer, qs.n_attention_wv);
+ }
}
size_t total_size_org = 0;
@@ -18681,6 +19105,11 @@ struct llama_context * llama_new_context_with_model(
return ctx;
}
+void llama_set_cross_attn_state(struct llama_context * ctx, float * cross_attn_state) { // stores the given pointer on the context; no data is copied here
+ ctx->cross_attn_state_first_pass = true; // next graph build recomputes cross-attention K/V instead of viewing the cached tensors
+ ctx->cross_attn_state = cross_attn_state; // NOTE: llama_set_inputs later reads hparams.n_embd * 1601 * 4 F32 elements from this pointer
+}
+
void llama_free(struct llama_context * ctx) {
delete ctx;
}
@@ -18731,6 +19160,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
// use what we call a normal RoPE, operating on pairs of consecutive head values
case LLM_ARCH_LLAMA:
+ case LLM_ARCH_MLLAMA:
case LLM_ARCH_BAICHUAN:
case LLM_ARCH_STARCODER:
case LLM_ARCH_PLAMO:
--
2.39.3 (Apple Git-146)

View File

@@ -1,18 +1,8 @@
From 2cfaa0a04faa9c87ba8f1ac8527eb953e69c6cde Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Mon, 16 Sep 2024 15:53:10 -0700
Subject: [PATCH] 01-load-progress.diff
---
common/common.cpp | 2 ++
common/common.h | 7 +++++++
2 files changed, 9 insertions(+)
diff --git a/common/common.cpp b/common/common.cpp
index 9fa18472..48ff41e9 100644
index 2c05a4d4..927f0e3d 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2573,6 +2573,8 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
@@ -2093,6 +2093,8 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
mparams.use_mmap = params.use_mmap;
mparams.use_mlock = params.use_mlock;
mparams.check_tensors = params.check_tensors;
@@ -22,10 +12,10 @@ index 9fa18472..48ff41e9 100644
mparams.kv_overrides = NULL;
} else {
diff --git a/common/common.h b/common/common.h
index cb5e7f6d..d8f043f7 100644
index 65c0ef81..ebca2c77 100644
--- a/common/common.h
+++ b/common/common.h
@@ -204,6 +204,13 @@ struct gpt_params {
@@ -184,6 +184,13 @@ struct gpt_params {
std::string mmproj = ""; // path to multimodal projector
std::vector<std::string> image; // path to image file(s)
@@ -39,6 +29,3 @@ index cb5e7f6d..d8f043f7 100644
// embedding
bool embedding = false; // get only sentence embedding
int32_t embd_normalize = 2; // normalisation for embendings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
--
2.46.0

View File

@@ -1,14 +1,5 @@
From ba4bba80a744f76ac67b8234451c259a3c5da83b Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Mon, 16 Sep 2024 15:53:11 -0700
Subject: [PATCH] 02-clip-log.diff
---
examples/llava/clip.cpp | 1 +
1 file changed, 1 insertion(+)
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 9b890571..cb51793d 100644
index e431c7f7..f077e688 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -3,6 +3,7 @@
@@ -19,6 +10,3 @@ index 9b890571..cb51793d 100644
#include "log.h"
#include "ggml.h"
#include "ggml-alloc.h"
--
2.46.0

View File

@@ -1,17 +1,8 @@
From e43bfd3f607a6dfcaba2d490d35f412a52e55e30 Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Mon, 16 Sep 2024 15:53:12 -0700
Subject: [PATCH] 03-load_exception.diff
---
src/llama.cpp | 25 ++++++++++++++++---------
1 file changed, 16 insertions(+), 9 deletions(-)
diff --git a/src/llama.cpp b/src/llama.cpp
index 88355971..926bb71a 100644
index 73f52435..58a00fb1 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -8635,7 +8635,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
@@ -7241,7 +7241,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
}
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
@@ -20,7 +11,7 @@ index 88355971..926bb71a 100644
}
return 0;
@@ -18022,16 +18022,23 @@ struct llama_model * llama_load_model_from_file(
@@ -17564,16 +17564,23 @@ struct llama_model * llama_load_model_from_file(
}
model->rpc_servers.push_back(servers);
}
@@ -52,6 +43,3 @@ index 88355971..926bb71a 100644
}
return model;
--
2.46.0

View File

@@ -1,17 +1,8 @@
From 29411d9a9d2b6a0af6425ffe88498f17f71f7d5d Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Mon, 16 Sep 2024 15:53:12 -0700
Subject: [PATCH] 04-metal.diff
---
ggml/src/ggml-metal.m | 30 +++++++++++++-----------------
1 file changed, 13 insertions(+), 17 deletions(-)
diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
index 91b5e61b..9cfa72ac 100644
index 0207b787..b5e9884b 100644
--- a/ggml/src/ggml-metal.m
+++ b/ggml/src/ggml-metal.m
@@ -1734,27 +1734,23 @@ static enum ggml_status ggml_metal_graph_compute(
@@ -1396,27 +1396,23 @@ static enum ggml_status ggml_metal_graph_compute(
// to the matrix-vector kernel
int ne11_mm_min = 1;
@@ -52,6 +43,3 @@ index 91b5e61b..9cfa72ac 100644
// for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
// AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
--
2.46.0

View File

@@ -1,17 +1,8 @@
From b298ac8614d1e38da28f760eb1d2ae8af0fbbe62 Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Mon, 16 Sep 2024 15:53:13 -0700
Subject: [PATCH] 05-default-pretokenizer.diff
---
src/llama.cpp | 14 +++-----------
1 file changed, 3 insertions(+), 11 deletions(-)
diff --git a/src/llama.cpp b/src/llama.cpp
index 926bb71a..d1e959fc 100644
index a207451f..2ddf431d 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -6083,16 +6083,7 @@ static void llm_load_vocab(
@@ -5347,16 +5347,7 @@ static void llm_load_vocab(
if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
vocab.tokenizer_add_space_prefix = false;
vocab.tokenizer_clean_spaces = true;
@@ -29,9 +20,9 @@ index 926bb71a..d1e959fc 100644
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if (
tokenizer_pre == "llama3" ||
@@ -6188,7 +6179,8 @@ static void llm_load_vocab(
tokenizer_pre == "exaone") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_EXAONE;
@@ -5443,7 +5434,8 @@ static void llm_load_vocab(
tokenizer_pre == "codeshell") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
} else {
- throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
+ LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
@@ -39,6 +30,3 @@ index 926bb71a..d1e959fc 100644
}
} else if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
--
2.46.0

View File

@@ -1,45 +1,37 @@
From c9a6ca9fc039233dee746a4da9705762cd9e515d Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Mon, 16 Sep 2024 15:53:14 -0700
Subject: [PATCH] 06-embeddings.diff
---
src/llama.cpp | 17 ++++++++++-------
1 file changed, 10 insertions(+), 7 deletions(-)
diff --git a/src/llama.cpp b/src/llama.cpp
index d1e959fc..f79bd782 100644
index 1fe2b9f7..a43312a7 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -15898,7 +15898,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
@@ -13689,7 +13689,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
const auto n_embd = hparams.n_embd;
// TODO: use a per-batch flag for logits presence instead
- const bool has_logits = !cparams.embeddings;
+ const bool has_logits = cparams.causal_attn;
const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
const bool has_embd = lctx.is_encoding || (cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE));
const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
@@ -16167,20 +16167,23 @@ static int llama_decode_internal(
@@ -13959,17 +13959,25 @@ static int llama_decode_internal(
// no output
res = nullptr;
embd = nullptr;
- } else if (cparams.embeddings) {
- res = nullptr; // do not extract logits for embedding case
- embd = nullptr;
- res = nullptr; // do not extract logits for embedding case
- embd = gf->nodes[gf->n_nodes - 1];
- if (strcmp(embd->name, "result_embd_pooled") != 0) {
- embd = gf->nodes[gf->n_nodes - 2];
+ }
+
+ if (cparams.embeddings) {
for (int i = gf->n_nodes - 1; i >= 0; --i) {
- if (strcmp(gf->nodes[i]->name, "result_embd_pooled") == 0) {
- embd = gf->nodes[i];
+ for (int i = gf->n_nodes - 1; i >= 0; --i) {
+ embd = gf->nodes[i];
+ if (strcmp(embd->name, "result_embd_pooled") == 0) {
break;
}
+ break;
+ }
}
- GGML_ASSERT(embd != nullptr && "missing embeddings tensor");
} else {
GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
- } else {
+ } else {
embd = nullptr; // do not extract embeddings when not needed
GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
}
@@ -47,9 +39,7 @@ index d1e959fc..f79bd782 100644
+ if (!cparams.causal_attn) {
+ res = nullptr; // do not extract logits when not needed
+ }
+
// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
ggml_backend_sched_alloc_graph(lctx.sched, gf);
--
2.46.0

Some files were not shown because too many files have changed in this diff Show More