mirror of
https://github.com/ollama/ollama.git
synced 2025-12-31 11:39:00 -05:00
Compare commits
1 Commits
scratch
...
cuda-searc
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
be721ca0df |
106
.github/workflows/test.yaml
vendored
106
.github/workflows/test.yaml
vendored
@@ -1,106 +0,0 @@
|
||||
name: test
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
|
||||
jobs:
|
||||
generate:
|
||||
strategy:
|
||||
matrix:
|
||||
os: [ubuntu-latest, macos-latest, windows-latest]
|
||||
arch: [amd64, arm64]
|
||||
exclude:
|
||||
- os: ubuntu-latest
|
||||
arch: arm64
|
||||
- os: windows-latest
|
||||
arch: arm64
|
||||
runs-on: ${{ matrix.os }}
|
||||
env:
|
||||
GOARCH: ${{ matrix.arch }}
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version: '1.21'
|
||||
cache: true
|
||||
- if: ${{ startsWith(matrix.os, 'windows-') }}
|
||||
shell: pwsh
|
||||
run: |
|
||||
$path = vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath
|
||||
if ($path) {
|
||||
$path = join-path $path 'Common7\Tools\vsdevcmd.bat'
|
||||
if (test-path $path) {
|
||||
cmd /s /c """$path"" $args && set" | where { $_ -match '(\w+)=(.*)' } | foreach {
|
||||
echo "$($Matches[1])=$($Matches[2])" | Out-File -FilePath $Env:GITHUB_ENV -Encoding utf8 -Append
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
echo "C:\Program Files\Git\usr\bin" | Out-File -FilePath $Env:GITHUB_PATH -Encoding utf8 -Append
|
||||
- run: go get ./...
|
||||
- run: go generate -x ./...
|
||||
- uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: ${{ matrix.os }}-${{ matrix.arch }}-libraries
|
||||
path: |
|
||||
llm/llama.cpp/build/**/lib/*
|
||||
lint:
|
||||
needs: generate
|
||||
strategy:
|
||||
matrix:
|
||||
os: [ubuntu-latest, macos-latest, windows-latest]
|
||||
arch: [amd64, arm64]
|
||||
exclude:
|
||||
- os: ubuntu-latest
|
||||
arch: arm64
|
||||
- os: windows-latest
|
||||
arch: arm64
|
||||
- os: macos-latest
|
||||
arch: amd64
|
||||
runs-on: ${{ matrix.os }}
|
||||
env:
|
||||
GOARCH: ${{ matrix.arch }}
|
||||
CGO_ENABLED: "1"
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: recursive
|
||||
- uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version: '1.21'
|
||||
cache: false
|
||||
- uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: ${{ matrix.os }}-${{ matrix.arch }}-libraries
|
||||
path: llm/llama.cpp/build
|
||||
- uses: golangci/golangci-lint-action@v3
|
||||
test:
|
||||
needs: generate
|
||||
strategy:
|
||||
matrix:
|
||||
os: [ubuntu-latest, macos-latest, windows-latest]
|
||||
arch: [amd64]
|
||||
exclude:
|
||||
- os: ubuntu-latest
|
||||
arch: arm64
|
||||
- os: windows-latest
|
||||
arch: arm64
|
||||
runs-on: ${{ matrix.os }}
|
||||
env:
|
||||
GOARCH: ${{ matrix.arch }}
|
||||
CGO_ENABLED: "1"
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: recursive
|
||||
- uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version: '1.21'
|
||||
cache: true
|
||||
- run: go get
|
||||
- uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: ${{ matrix.os }}-${{ matrix.arch }}-libraries
|
||||
path: llm/llama.cpp/build
|
||||
- run: go build
|
||||
- run: go test -v ./...
|
||||
@@ -1,27 +0,0 @@
|
||||
run:
|
||||
timeout: 5m
|
||||
linters:
|
||||
enable:
|
||||
- asasalint
|
||||
- bidichk
|
||||
- bodyclose
|
||||
- containedctx
|
||||
- contextcheck
|
||||
- exportloopref
|
||||
- gocheckcompilerdirectives
|
||||
# FIXME: for some reason this errors on windows
|
||||
# - gofmt
|
||||
# - goimports
|
||||
- misspell
|
||||
- nilerr
|
||||
- unused
|
||||
linters-settings:
|
||||
errcheck:
|
||||
# exclude the following functions since we don't generally
|
||||
# need to be concerned with the returned errors
|
||||
exclude-functions:
|
||||
- encoding/binary.Read
|
||||
- (*os.File).Seek
|
||||
- (*bufio.Writer).WriteString
|
||||
- (*github.com/spf13/pflag.FlagSet).Set
|
||||
- (*github.com/jmorganca/ollama/llm.readSeekOffset).Seek
|
||||
152
Dockerfile.build
152
Dockerfile.build
@@ -1,99 +1,101 @@
|
||||
ARG GOLANG_VERSION=1.21.3
|
||||
ARG CMAKE_VERSION=3.22.1
|
||||
ARG CUDA_VERSION=11.3.1
|
||||
|
||||
# Copy the minimal context we need to run the generate scripts
|
||||
FROM scratch AS llm-code
|
||||
COPY .git .git
|
||||
COPY .gitmodules .gitmodules
|
||||
COPY llm llm
|
||||
ARG ROCM_VERSION=5.7.1
|
||||
|
||||
FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION-devel-centos7 AS cuda-build-amd64
|
||||
|
||||
ARG CMAKE_VERSION
|
||||
ARG CGO_CFLAGS
|
||||
COPY ./scripts/rh_linux_deps.sh /
|
||||
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
|
||||
|
||||
RUN yum install -y https://repo.ius.io/ius-release-el7.rpm centos-release-scl \
|
||||
&& yum update -y \
|
||||
&& yum install -y devtoolset-10-gcc devtoolset-10-gcc-c++ git236
|
||||
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
|
||||
COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
|
||||
WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
|
||||
RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
|
||||
|
||||
ADD https://github.com/Kitware/CMake/releases/download/v$CMAKE_VERSION/cmake-$CMAKE_VERSION-linux-x86_64.tar.gz /tmp/cmake-$CMAKE_VERSION.tar.gz
|
||||
RUN tar -zx -C /usr --strip-components 1 </tmp/cmake-$CMAKE_VERSION.tar.gz
|
||||
|
||||
WORKDIR /go/src/github.com/jmorganca/ollama
|
||||
COPY . .
|
||||
|
||||
WORKDIR llm/generate
|
||||
RUN sh gen_linux.sh
|
||||
|
||||
FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION-devel-rockylinux8 AS cuda-build-arm64
|
||||
|
||||
ARG CMAKE_VERSION
|
||||
ARG CGO_CFLAGS
|
||||
COPY ./scripts/rh_linux_deps.sh /
|
||||
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
|
||||
ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH
|
||||
COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
|
||||
WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
|
||||
RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
|
||||
|
||||
FROM --platform=linux/amd64 rocm/dev-centos-7:5.7.1-complete AS rocm-5-build-amd64
|
||||
ARG CMAKE_VERSION
|
||||
ARG CGO_CFLAGS
|
||||
COPY ./scripts/rh_linux_deps.sh /
|
||||
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
|
||||
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
|
||||
ENV LIBRARY_PATH /opt/amdgpu/lib64
|
||||
COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
|
||||
WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
|
||||
RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
|
||||
RUN dnf install -y git cmake
|
||||
|
||||
FROM --platform=linux/amd64 rocm/dev-centos-7:6.0-complete AS rocm-6-build-amd64
|
||||
ARG CMAKE_VERSION
|
||||
ARG CGO_CFLAGS
|
||||
COPY ./scripts/rh_linux_deps.sh /
|
||||
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
|
||||
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
|
||||
ENV LIBRARY_PATH /opt/amdgpu/lib64
|
||||
COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
|
||||
WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
|
||||
RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
|
||||
|
||||
FROM --platform=linux/amd64 centos:7 AS cpu-build-amd64
|
||||
ARG CMAKE_VERSION
|
||||
ARG GOLANG_VERSION
|
||||
ARG OLLAMA_CUSTOM_CPU_DEFS
|
||||
ARG CGO_CFLAGS
|
||||
COPY ./scripts/rh_linux_deps.sh /
|
||||
RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
|
||||
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
|
||||
COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
|
||||
WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
|
||||
RUN sh gen_linux.sh
|
||||
|
||||
FROM --platform=linux/arm64 centos:7 AS cpu-build-arm64
|
||||
ARG CMAKE_VERSION
|
||||
ARG GOLANG_VERSION
|
||||
ARG OLLAMA_CUSTOM_CPU_DEFS
|
||||
ARG CGO_CFLAGS
|
||||
COPY ./scripts/rh_linux_deps.sh /
|
||||
RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
|
||||
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
|
||||
COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
|
||||
WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
|
||||
RUN sh gen_linux.sh
|
||||
|
||||
|
||||
FROM --platform=linux/amd64 cpu-build-amd64 AS build-amd64
|
||||
ENV CGO_ENABLED 1
|
||||
ARG GOFLAGS
|
||||
ARG CGO_CFLAGS
|
||||
WORKDIR /go/src/github.com/jmorganca/ollama
|
||||
COPY . .
|
||||
COPY --from=cuda-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
|
||||
COPY --from=rocm-5-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
|
||||
COPY --from=rocm-6-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
|
||||
|
||||
WORKDIR llm/generate
|
||||
RUN sh gen_linux.sh
|
||||
|
||||
FROM --platform=linux/amd64 rocm/dev-centos-7:$ROCM_VERSION-complete AS rocm-build-amd64
|
||||
|
||||
ARG CMAKE_VERSION
|
||||
|
||||
RUN yum install -y https://repo.ius.io/ius-release-el7.rpm centos-release-scl \
|
||||
&& yum update -y \
|
||||
&& yum remove -y git \
|
||||
&& yum install -y devtoolset-10-gcc devtoolset-10-gcc-c++ git236
|
||||
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
|
||||
ENV LIBRARY_PATH /opt/amdgpu/lib64
|
||||
|
||||
ADD https://github.com/Kitware/CMake/releases/download/v$CMAKE_VERSION/cmake-$CMAKE_VERSION-linux-x86_64.tar.gz /tmp/cmake-$CMAKE_VERSION.tar.gz
|
||||
RUN tar -zx -C /usr --strip-components 1 </tmp/cmake-$CMAKE_VERSION.tar.gz
|
||||
|
||||
WORKDIR /go/src/github.com/jmorganca/ollama
|
||||
COPY . .
|
||||
|
||||
WORKDIR llm/generate
|
||||
RUN sh gen_linux.sh
|
||||
|
||||
FROM --platform=linux/amd64 centos:7 AS build-amd64
|
||||
ENV CGO_ENABLED 1
|
||||
|
||||
ARG GOLANG_VERSION
|
||||
ARG GOFLAGS
|
||||
ARG CGO_FLAGS
|
||||
|
||||
RUN yum install -y centos-release-scl \
|
||||
&& yum update -y \
|
||||
&& yum install -y devtoolset-10-gcc devtoolset-10-gcc-c++
|
||||
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
|
||||
|
||||
ADD https://dl.google.com/go/go$GOLANG_VERSION.linux-amd64.tar.gz /tmp/go-$GOLANG_VERSION.tar.gz
|
||||
RUN mkdir -p /usr/local && tar xz -C /usr/local </tmp/go-$GOLANG_VERSION.tar.gz
|
||||
ENV PATH /usr/local/go/bin:$PATH
|
||||
|
||||
WORKDIR /go/src/github.com/jmorganca/ollama
|
||||
COPY . .
|
||||
COPY --from=cuda-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/cpu/lib llm/llama.cpp/build/linux/cpu/lib
|
||||
COPY --from=cuda-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/cuda/lib llm/llama.cpp/build/linux/cuda/lib
|
||||
COPY --from=rocm-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/rocm/lib llm/llama.cpp/build/linux/rocm/lib
|
||||
RUN go build .
|
||||
|
||||
FROM --platform=linux/arm64 cpu-build-arm64 AS build-arm64
|
||||
FROM --platform=linux/arm64 centos:7 AS build-arm64
|
||||
ENV CGO_ENABLED 1
|
||||
|
||||
ARG GOLANG_VERSION
|
||||
ARG GOFLAGS
|
||||
ARG CGO_CFLAGS
|
||||
ARG CGO_FLAGS
|
||||
|
||||
RUN yum install -y centos-release-scl \
|
||||
&& yum update -y \
|
||||
&& yum install -y devtoolset-10-gcc devtoolset-10-gcc-c++
|
||||
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
|
||||
|
||||
ADD https://dl.google.com/go/go$GOLANG_VERSION.linux-arm64.tar.gz /tmp/go-$GOLANG_VERSION.tar.gz
|
||||
RUN mkdir -p /usr/local && tar xz -C /usr/local </tmp/go-$GOLANG_VERSION.tar.gz
|
||||
ENV PATH /usr/local/go/bin:$PATH
|
||||
|
||||
WORKDIR /go/src/github.com/jmorganca/ollama
|
||||
COPY . .
|
||||
COPY --from=cuda-build-arm64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
|
||||
COPY --from=cuda-build-arm64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/cpu/lib llm/llama.cpp/build/linux/cpu/lib
|
||||
COPY --from=cuda-build-arm64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/cuda/lib llm/llama.cpp/build/linux/cuda/lib
|
||||
RUN go build .
|
||||
|
||||
FROM build-$TARGETARCH
|
||||
|
||||
@@ -248,10 +248,6 @@ curl http://localhost:11434/api/chat -d '{
|
||||
|
||||
See the [API documentation](./docs/api.md) for all endpoints.
|
||||
|
||||
## Integrations
|
||||
|
||||
- [ollama-python](https://github.com/jmorganca/ollama-python)
|
||||
|
||||
## Community Integrations
|
||||
|
||||
### Web & Desktop
|
||||
@@ -304,9 +300,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||
- [Ollama for Dart](https://github.com/breitburg/dart-ollama)
|
||||
- [Ollama for Laravel](https://github.com/cloudstudio/ollama-laravel)
|
||||
- [LangChainDart](https://github.com/davidmigloz/langchain_dart)
|
||||
- [Semantic Kernel - Python](https://github.com/microsoft/semantic-kernel/tree/main/python/semantic_kernel/connectors/ai/ollama)
|
||||
- [Haystack](https://github.com/deepset-ai/haystack-integrations/blob/main/integrations/ollama.md)
|
||||
|
||||
|
||||
### Mobile
|
||||
|
||||
@@ -327,4 +320,3 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||
- [Rivet plugin](https://github.com/abrenneke/rivet-plugin-ollama)
|
||||
- [Llama Coder](https://github.com/ex3ndr/llama-coder) (Copilot alternative using Ollama)
|
||||
- [Obsidian BMO Chatbot plugin](https://github.com/longy2k/obsidian-bmo-chatbot)
|
||||
- [Open Interpreter](https://docs.openinterpreter.com/language-model-setup/local-models/ollama)
|
||||
284
api/client.py
Normal file
284
api/client.py
Normal file
@@ -0,0 +1,284 @@
|
||||
import os
|
||||
import json
|
||||
import requests
|
||||
import os
|
||||
import hashlib
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
BASE_URL = os.environ.get('OLLAMA_HOST', 'http://localhost:11434')
|
||||
|
||||
# Generate a response for a given prompt with a provided model. This is a streaming endpoint, so will be a series of responses.
|
||||
# The final response object will include statistics and additional data from the request. Use the callback function to override
|
||||
# the default handler.
|
||||
def generate(model_name, prompt, system=None, template=None, format="", context=None, options=None, callback=None):
    """Stream a completion for `prompt` from the server's /api/generate endpoint.

    Arguments left as None are omitted from the request body. Each non-empty
    line of the streamed reply is decoded as one JSON chunk. When `callback`
    is given it receives every chunk and nothing is printed or accumulated;
    otherwise the "response" text of every chunk that is not marked done is
    printed as it arrives and concatenated.

    Returns a `(full_response, final_context)` tuple, where `final_context` is
    the "context" value of the chunk marked done (None if none was seen).
    Returns `(None, None)` after printing the error when the request fails.
    """
    try:
        endpoint = f"{BASE_URL}/api/generate"
        candidate_fields = (
            ("model", model_name),
            ("prompt", prompt),
            ("system", system),
            ("template", template),
            ("context", context),
            ("options", options),
            ("format", format),
        )
        # Only fields the caller actually supplied are sent.
        body = {key: value for key, value in candidate_fields if value is not None}

        collected = ""
        last_context = None

        with requests.post(endpoint, json=body, stream=True) as response:
            response.raise_for_status()

            for raw in response.iter_lines():
                if not raw:
                    continue

                chunk = json.loads(raw)

                if callback:
                    callback(chunk)
                elif not chunk.get("done"):
                    piece = chunk.get("response", "")
                    collected += piece
                    print(piece, end="", flush=True)

                # The chunk flagged as done carries the context to hand back.
                if chunk.get("done"):
                    last_context = chunk.get("context")

            return collected, last_context
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None, None
|
||||
|
||||
|
||||
# Create a blob file on the server if it doesn't exist.
|
||||
def create_blob(digest, file_path):
    """Upload the file at `file_path` as blob `digest` unless the server already has it.

    A HEAD request probes `{BASE_URL}/api/blobs/{digest}`. Only a 404 reply
    triggers the upload; a successful probe returns immediately, and any other
    error status is raised instead of being mistaken for "blob exists".

    Raises:
        requests.exceptions.HTTPError: if the probe fails with a status other
            than 404, or if the upload itself is rejected.
        OSError: if `file_path` cannot be opened.
    """
    url = f"{BASE_URL}/api/blobs/{digest}"

    probe = requests.head(url)
    if probe.status_code != 404:
        # Either the blob is already there (nothing to do) or the probe failed
        # for another reason, which must not be silently treated as success.
        probe.raise_for_status()
        return

    # 404: the server does not have the blob yet. The file object is streamed
    # as the request body and the response is checked so a failed upload is
    # not ignored.
    with open(file_path, 'rb') as file_data:
        upload = requests.post(url, data=file_data)
    upload.raise_for_status()
|
||||
|
||||
|
||||
# Create a model from a Modelfile. Use the callback function to override the default handler.
|
||||
def create(model_name, filename, callback=None):
    """Create model `model_name` on the server from the Modelfile at `filename`.

    The Modelfile is read line by line. For FROM and ADAPTER lines whose
    argument names an existing local file (relative paths are resolved against
    the Modelfile's directory), the file is hashed with SHA-256, uploaded via
    `create_blob`, and the line is rewritten to reference `@sha256:<hex>`.
    Every other line is forwarded unchanged. The result is posted to
    /api/create and the streamed status chunks are passed to `callback`, or
    printed when no callback is given.

    Any exception is caught and reported with a printed message; nothing is
    returned.
    """
    try:
        file_path = Path(filename).expanduser().resolve()
        processed_lines = []

        with open(file_path, 'r') as f:
            for line in f:
                parts = line.split(maxsplit=1)

                # Blank lines and single-token lines have no argument to
                # inspect; keep them verbatim instead of failing to unpack.
                if len(parts) < 2:
                    processed_lines.append(line)
                    continue

                command, args = parts

                if command.upper() in ("FROM", "ADAPTER"):
                    path = Path(args.strip()).expanduser()

                    if not path.is_absolute():
                        path = file_path.parent / path

                    # Anything that is not a local file (e.g. a model name) is
                    # left for the server to resolve.
                    if path.is_file():
                        # Hash in chunks so large files are never held in
                        # memory in full.
                        digest = hashlib.sha256()
                        with open(path, 'rb') as bin_file:
                            for block in iter(lambda: bin_file.read(1024 * 1024), b""):
                                digest.update(block)
                        blob = f"sha256:{digest.hexdigest()}"

                        create_blob(blob, path)

                        line = f"{command} @{blob}\n"

                processed_lines.append(line)

        # Lines still carry their own newline, so plain concatenation
        # reproduces the file without inserting extra blank lines.
        modelfile_content = ''.join(processed_lines)

        url = f"{BASE_URL}/api/create"
        payload = {"name": model_name, "modelfile": modelfile_content}

        with requests.post(url, json=payload, stream=True) as response:
            response.raise_for_status()

            for line in response.iter_lines():
                if not line:
                    continue

                chunk = json.loads(line)
                if callback:
                    callback(chunk)
                else:
                    print(f"Status: {chunk.get('status')}")

    except Exception as e:
        print(f"An error occurred: {e}")
|
||||
|
||||
|
||||
# Pull a model from the model registry. Cancelled pulls are resumed from where they left off, and multiple
|
||||
# calls will share the same download progress. Use the callback function to override the default handler.
|
||||
def pull(model_name, insecure=False, callback=None):
    """Pull `model_name` via the server's /api/pull endpoint, reporting progress.

    The reply is streamed; each non-empty line is decoded as one JSON chunk.
    With a `callback`, every chunk is handed to it. Without one, the chunk's
    status is printed, followed by digest/total/completed when the chunk
    carries a digest.

    `insecure` is forwarded to the server unchanged. Request failures are
    caught and reported with a printed message; nothing is returned.
    """
    try:
        url = f"{BASE_URL}/api/pull"
        payload = {
            "name": model_name,
            "insecure": insecure
        }

        with requests.post(url, json=payload, stream=True) as response:
            response.raise_for_status()

            for line in response.iter_lines():
                if not line:
                    continue

                chunk = json.loads(line)

                if callback:
                    callback(chunk)
                    continue

                print(chunk.get('status', ''), end='', flush=True)

                if 'digest' in chunk:
                    # "total"/"completed" are read defensively: a chunk that
                    # carries a digest but lacks one of them would otherwise
                    # raise a KeyError that the handler below does not catch.
                    # NOTE(review): presumably the server drops zero-valued
                    # counters - confirm against ProgressResponse.
                    print(f" - Digest: {chunk['digest']}", end='', flush=True)
                    print(f" - Total: {chunk.get('total', 0)}", end='', flush=True)
                    print(f" - Completed: {chunk.get('completed', 0)}", end='\n', flush=True)
                else:
                    print()
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
|
||||
|
||||
# Push a model to the model registry. Use the callback function to override the default handler.
|
||||
def push(model_name, insecure=False, callback=None):
    """Push `model_name` via the server's /api/push endpoint, reporting progress.

    The reply is streamed; each non-empty line is decoded as one JSON chunk.
    With a `callback`, every chunk is handed to it. Without one, the chunk's
    status is printed, followed by digest/total/completed when the chunk
    carries a digest.

    `insecure` is forwarded to the server unchanged. Request failures are
    caught and reported with a printed message; nothing is returned.
    """
    try:
        url = f"{BASE_URL}/api/push"
        payload = {
            "name": model_name,
            "insecure": insecure
        }

        with requests.post(url, json=payload, stream=True) as response:
            response.raise_for_status()

            for line in response.iter_lines():
                if not line:
                    continue

                chunk = json.loads(line)

                if callback:
                    callback(chunk)
                    continue

                print(chunk.get('status', ''), end='', flush=True)

                if 'digest' in chunk:
                    # "total"/"completed" are read defensively: a chunk that
                    # carries a digest but lacks one of them would otherwise
                    # raise a KeyError that the handler below does not catch.
                    # NOTE(review): presumably the server drops zero-valued
                    # counters - confirm against ProgressResponse.
                    print(f" - Digest: {chunk['digest']}", end='', flush=True)
                    print(f" - Total: {chunk.get('total', 0)}", end='', flush=True)
                    print(f" - Completed: {chunk.get('completed', 0)}", end='\n', flush=True)
                else:
                    print()
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
|
||||
|
||||
# List models that are available locally.
|
||||
def list():
    """Fetch the models available locally from the server's /api/tags endpoint.

    Returns the value stored under "models" in the JSON reply (an empty list
    when that key is absent), or None after printing the error when the
    request fails.
    """
    try:
        reply = requests.get(f"{BASE_URL}/api/tags")
        reply.raise_for_status()
        return reply.json().get('models', [])
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None
|
||||
|
||||
# Copy a model. Creates a model with another name from an existing model.
|
||||
def copy(source, destination):
    """Ask the server to copy model `source` to a new model named `destination`.

    Posts both names to /api/copy. Returns the string "Copy successful" when
    the server accepts the request, or None after printing the error when the
    request fails.
    """
    body = {"source": source, "destination": destination}
    try:
        reply = requests.post(f"{BASE_URL}/api/copy", json=body)
        reply.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None
    return "Copy successful"
|
||||
|
||||
# Delete a model and its data.
|
||||
def delete(model_name):
    """Ask the server to delete model `model_name` through /api/delete.

    Returns the string "Delete successful" when the server accepts the
    request, or None after printing the error when the request fails.
    """
    try:
        reply = requests.delete(f"{BASE_URL}/api/delete", json={"name": model_name})
        reply.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None
    return "Delete successful"
|
||||
|
||||
# Show info about a model.
|
||||
def show(model_name):
    """Fetch information about model `model_name` from the server's /api/show endpoint.

    Returns the decoded JSON reply, or None after printing the error when the
    request fails.
    """
    try:
        reply = requests.post(f"{BASE_URL}/api/show", json={"name": model_name})
        reply.raise_for_status()
        return reply.json()
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None
|
||||
|
||||
def heartbeat():
    """Probe the server root with a HEAD request and report whether it answered.

    Returns "Ollama is running" when the request succeeds, otherwise prints
    the error and returns "Ollama is not running".
    """
    try:
        requests.head(f"{BASE_URL}/").raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return "Ollama is not running"
    return "Ollama is running"
|
||||
23
api/types.go
23
api/types.go
@@ -137,31 +137,23 @@ type EmbeddingResponse struct {
|
||||
}
|
||||
|
||||
type CreateRequest struct {
|
||||
Model string `json:"model"`
|
||||
Name string `json:"name"`
|
||||
Path string `json:"path"`
|
||||
Modelfile string `json:"modelfile"`
|
||||
Stream *bool `json:"stream,omitempty"`
|
||||
|
||||
// Name is deprecated, see Model
|
||||
Name string `json:"name"`
|
||||
}
|
||||
|
||||
type DeleteRequest struct {
|
||||
Model string `json:"model"`
|
||||
|
||||
// Name is deprecated, see Model
|
||||
Name string `json:"name"`
|
||||
}
|
||||
|
||||
type ShowRequest struct {
|
||||
Name string `json:"name"`
|
||||
Model string `json:"model"`
|
||||
System string `json:"system"`
|
||||
Template string `json:"template"`
|
||||
|
||||
Options map[string]interface{} `json:"options"`
|
||||
|
||||
// Name is deprecated, see Model
|
||||
Name string `json:"name"`
|
||||
}
|
||||
|
||||
type ShowResponse struct {
|
||||
@@ -179,14 +171,11 @@ type CopyRequest struct {
|
||||
}
|
||||
|
||||
type PullRequest struct {
|
||||
Model string `json:"model"`
|
||||
Name string `json:"name"`
|
||||
Insecure bool `json:"insecure,omitempty"`
|
||||
Username string `json:"username"`
|
||||
Password string `json:"password"`
|
||||
Stream *bool `json:"stream,omitempty"`
|
||||
|
||||
// Name is deprecated, see Model
|
||||
Name string `json:"name"`
|
||||
}
|
||||
|
||||
type ProgressResponse struct {
|
||||
@@ -197,14 +186,11 @@ type ProgressResponse struct {
|
||||
}
|
||||
|
||||
type PushRequest struct {
|
||||
Model string `json:"model"`
|
||||
Name string `json:"name"`
|
||||
Insecure bool `json:"insecure,omitempty"`
|
||||
Username string `json:"username"`
|
||||
Password string `json:"password"`
|
||||
Stream *bool `json:"stream,omitempty"`
|
||||
|
||||
// Name is deprecated, see Model
|
||||
Name string `json:"name"`
|
||||
}
|
||||
|
||||
type ListResponse struct {
|
||||
@@ -213,7 +199,6 @@ type ListResponse struct {
|
||||
|
||||
type ModelResponse struct {
|
||||
Name string `json:"name"`
|
||||
Model string `json:"model"`
|
||||
ModifiedAt time.Time `json:"modified_at"`
|
||||
Size int64 `json:"size"`
|
||||
Digest string `json:"digest"`
|
||||
|
||||
178
cmd/cmd.go
178
cmd/cmd.go
@@ -35,6 +35,8 @@ import (
|
||||
"github.com/jmorganca/ollama/version"
|
||||
)
|
||||
|
||||
type ImageData []byte
|
||||
|
||||
func CreateHandler(cmd *cobra.Command, args []string) error {
|
||||
filename, _ := cmd.Flags().GetString("file")
|
||||
filename, err := filepath.Abs(filename)
|
||||
@@ -413,10 +415,11 @@ func PullHandler(cmd *cobra.Command, args []string) error {
|
||||
func RunGenerate(cmd *cobra.Command, args []string) error {
|
||||
interactive := true
|
||||
|
||||
opts := runOptions{
|
||||
opts := generateOptions{
|
||||
Model: args[0],
|
||||
WordWrap: os.Getenv("TERM") == "xterm-256color",
|
||||
Options: map[string]interface{}{},
|
||||
Images: []ImageData{},
|
||||
}
|
||||
|
||||
format, err := cmd.Flags().GetString("format")
|
||||
@@ -457,135 +460,18 @@ func RunGenerate(cmd *cobra.Command, args []string) error {
|
||||
|
||||
type generateContextKey string
|
||||
|
||||
type runOptions struct {
|
||||
type generateOptions struct {
|
||||
Model string
|
||||
Prompt string
|
||||
Messages []api.Message
|
||||
WordWrap bool
|
||||
Format string
|
||||
System string
|
||||
Template string
|
||||
Images []api.ImageData
|
||||
Images []ImageData
|
||||
Options map[string]interface{}
|
||||
}
|
||||
|
||||
// displayResponseState carries the word-wrap bookkeeping that displayResponse
// needs between successive calls while one reply is being streamed.
type displayResponseState struct {
	// lineLength counts the characters written on the current output line;
	// displayResponse resets it when it prints a newline.
	lineLength int
	// wordBuffer accumulates the characters of the word currently being
	// written so it can be re-emitted on the next line when it must wrap.
	wordBuffer string
}
|
||||
|
||||
func displayResponse(content string, wordWrap bool, state *displayResponseState) {
|
||||
termWidth, _, _ := term.GetSize(int(os.Stdout.Fd()))
|
||||
if wordWrap && termWidth >= 10 {
|
||||
for _, ch := range content {
|
||||
if state.lineLength+1 > termWidth-5 {
|
||||
if len(state.wordBuffer) > termWidth-10 {
|
||||
fmt.Printf("%s%c", state.wordBuffer, ch)
|
||||
state.wordBuffer = ""
|
||||
state.lineLength = 0
|
||||
continue
|
||||
}
|
||||
|
||||
// backtrack the length of the last word and clear to the end of the line
|
||||
fmt.Printf("\x1b[%dD\x1b[K\n", len(state.wordBuffer))
|
||||
fmt.Printf("%s%c", state.wordBuffer, ch)
|
||||
state.lineLength = len(state.wordBuffer) + 1
|
||||
} else {
|
||||
fmt.Print(string(ch))
|
||||
state.lineLength += 1
|
||||
|
||||
switch ch {
|
||||
case ' ':
|
||||
state.wordBuffer = ""
|
||||
case '\n':
|
||||
state.lineLength = 0
|
||||
default:
|
||||
state.wordBuffer += string(ch)
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
fmt.Printf("%s%s", state.wordBuffer, content)
|
||||
if len(state.wordBuffer) > 0 {
|
||||
state.wordBuffer = ""
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
|
||||
client, err := api.ClientFromEnvironment()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
p := progress.NewProgress(os.Stderr)
|
||||
defer p.StopAndClear()
|
||||
|
||||
spinner := progress.NewSpinner("")
|
||||
p.Add("", spinner)
|
||||
|
||||
cancelCtx, cancel := context.WithCancel(cmd.Context())
|
||||
defer cancel()
|
||||
|
||||
sigChan := make(chan os.Signal, 1)
|
||||
signal.Notify(sigChan, syscall.SIGINT)
|
||||
|
||||
go func() {
|
||||
<-sigChan
|
||||
cancel()
|
||||
}()
|
||||
|
||||
var state *displayResponseState = &displayResponseState{}
|
||||
var latest api.ChatResponse
|
||||
var fullResponse strings.Builder
|
||||
var role string
|
||||
|
||||
fn := func(response api.ChatResponse) error {
|
||||
p.StopAndClear()
|
||||
|
||||
latest = response
|
||||
|
||||
role = response.Message.Role
|
||||
content := response.Message.Content
|
||||
fullResponse.WriteString(content)
|
||||
|
||||
displayResponse(content, opts.WordWrap, state)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
req := &api.ChatRequest{
|
||||
Model: opts.Model,
|
||||
Messages: opts.Messages,
|
||||
Format: opts.Format,
|
||||
Options: opts.Options,
|
||||
}
|
||||
|
||||
if err := client.Chat(cancelCtx, req, fn); err != nil {
|
||||
if errors.Is(err, context.Canceled) {
|
||||
return nil, nil
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if len(opts.Messages) > 0 {
|
||||
fmt.Println()
|
||||
fmt.Println()
|
||||
}
|
||||
|
||||
verbose, err := cmd.Flags().GetBool("verbose")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if verbose {
|
||||
latest.Summary()
|
||||
}
|
||||
|
||||
return &api.Message{Role: role, Content: fullResponse.String()}, nil
|
||||
}
|
||||
|
||||
func generate(cmd *cobra.Command, opts runOptions) error {
|
||||
func generate(cmd *cobra.Command, opts generateOptions) error {
|
||||
client, err := api.ClientFromEnvironment()
|
||||
if err != nil {
|
||||
return err
|
||||
@@ -604,6 +490,11 @@ func generate(cmd *cobra.Command, opts runOptions) error {
|
||||
generateContext = []int{}
|
||||
}
|
||||
|
||||
termWidth, _, err := term.GetSize(int(os.Stdout.Fd()))
|
||||
if err != nil {
|
||||
opts.WordWrap = false
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithCancel(cmd.Context())
|
||||
defer cancel()
|
||||
|
||||
@@ -615,19 +506,57 @@ func generate(cmd *cobra.Command, opts runOptions) error {
|
||||
cancel()
|
||||
}()
|
||||
|
||||
var state *displayResponseState = &displayResponseState{}
|
||||
var currentLineLength int
|
||||
var wordBuffer string
|
||||
|
||||
fn := func(response api.GenerateResponse) error {
|
||||
p.StopAndClear()
|
||||
|
||||
latest = response
|
||||
content := response.Response
|
||||
|
||||
displayResponse(content, opts.WordWrap, state)
|
||||
termWidth, _, _ = term.GetSize(int(os.Stdout.Fd()))
|
||||
if opts.WordWrap && termWidth >= 10 {
|
||||
for _, ch := range response.Response {
|
||||
if currentLineLength+1 > termWidth-5 {
|
||||
if len(wordBuffer) > termWidth-10 {
|
||||
fmt.Printf("%s%c", wordBuffer, ch)
|
||||
wordBuffer = ""
|
||||
currentLineLength = 0
|
||||
continue
|
||||
}
|
||||
|
||||
// backtrack the length of the last word and clear to the end of the line
|
||||
fmt.Printf("\x1b[%dD\x1b[K\n", len(wordBuffer))
|
||||
fmt.Printf("%s%c", wordBuffer, ch)
|
||||
currentLineLength = len(wordBuffer) + 1
|
||||
} else {
|
||||
fmt.Print(string(ch))
|
||||
currentLineLength += 1
|
||||
|
||||
switch ch {
|
||||
case ' ':
|
||||
wordBuffer = ""
|
||||
case '\n':
|
||||
currentLineLength = 0
|
||||
default:
|
||||
wordBuffer += string(ch)
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
fmt.Printf("%s%s", wordBuffer, response.Response)
|
||||
if len(wordBuffer) > 0 {
|
||||
wordBuffer = ""
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
images := make([]api.ImageData, 0)
|
||||
for _, i := range opts.Images {
|
||||
images = append(images, api.ImageData(i))
|
||||
}
|
||||
request := api.GenerateRequest{
|
||||
Model: opts.Model,
|
||||
Prompt: opts.Prompt,
|
||||
@@ -636,6 +565,7 @@ func generate(cmd *cobra.Command, opts runOptions) error {
|
||||
System: opts.System,
|
||||
Template: opts.Template,
|
||||
Options: opts.Options,
|
||||
Images: images,
|
||||
}
|
||||
|
||||
if err := client.Generate(ctx, &request, fn); err != nil {
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package cmd
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
@@ -42,16 +43,16 @@ func modelIsMultiModal(cmd *cobra.Command, name string) bool {
|
||||
return slices.Contains(resp.Details.Families, "clip")
|
||||
}
|
||||
|
||||
func generateInteractive(cmd *cobra.Command, opts runOptions) error {
|
||||
func generateInteractive(cmd *cobra.Command, opts generateOptions) error {
|
||||
multiModal := modelIsMultiModal(cmd, opts.Model)
|
||||
|
||||
// load the model
|
||||
loadOpts := runOptions{
|
||||
Model: opts.Model,
|
||||
Prompt: "",
|
||||
Messages: []api.Message{},
|
||||
loadOpts := generateOptions{
|
||||
Model: opts.Model,
|
||||
Prompt: "",
|
||||
Images: []ImageData{},
|
||||
}
|
||||
if _, err := chat(cmd, loadOpts); err != nil {
|
||||
if err := generate(cmd, loadOpts); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -140,7 +141,6 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
|
||||
|
||||
var sb strings.Builder
|
||||
var multiline MultilineState
|
||||
opts.Messages = make([]api.Message, 0)
|
||||
|
||||
for {
|
||||
line, err := scanner.Readline()
|
||||
@@ -238,13 +238,16 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
|
||||
usageParameters()
|
||||
continue
|
||||
}
|
||||
params := args[3:]
|
||||
var params []string
|
||||
for _, p := range args[3:] {
|
||||
params = append(params, p)
|
||||
}
|
||||
fp, err := api.FormatParams(map[string][]string{args[2]: params})
|
||||
if err != nil {
|
||||
fmt.Printf("Couldn't set parameter: %q\n", err)
|
||||
fmt.Printf("Couldn't set parameter: %q\n\n", err)
|
||||
continue
|
||||
}
|
||||
fmt.Printf("Set parameter '%s' to '%s'\n", args[2], strings.Join(params, ", "))
|
||||
fmt.Printf("Set parameter '%s' to '%s'\n\n", args[2], strings.Join(params, ", "))
|
||||
opts.Options[args[2]] = fp[args[2]]
|
||||
case "system", "template":
|
||||
if len(args) < 3 {
|
||||
@@ -325,7 +328,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
|
||||
fmt.Println("")
|
||||
case "license":
|
||||
if resp.License == "" {
|
||||
fmt.Println("No license was specified for this model.")
|
||||
fmt.Print("No license was specified for this model.\n\n")
|
||||
} else {
|
||||
fmt.Println(resp.License)
|
||||
}
|
||||
@@ -333,7 +336,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
|
||||
fmt.Println(resp.Modelfile)
|
||||
case "parameters":
|
||||
if resp.Parameters == "" {
|
||||
fmt.Println("No parameters were specified for this model.")
|
||||
fmt.Print("No parameters were specified for this model.\n\n")
|
||||
} else {
|
||||
if len(opts.Options) > 0 {
|
||||
fmt.Println("User defined parameters:")
|
||||
@@ -352,7 +355,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
|
||||
case resp.System != "":
|
||||
fmt.Println(resp.System + "\n")
|
||||
default:
|
||||
fmt.Println("No system message was specified for this model.")
|
||||
fmt.Print("No system message was specified for this model.\n\n")
|
||||
}
|
||||
case "template":
|
||||
switch {
|
||||
@@ -361,7 +364,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
|
||||
case resp.Template != "":
|
||||
fmt.Println(resp.Template)
|
||||
default:
|
||||
fmt.Println("No prompt template was specified for this model.")
|
||||
fmt.Print("No prompt template was specified for this model.\n\n")
|
||||
}
|
||||
default:
|
||||
fmt.Printf("Unknown command '/show %s'. Type /? for help\n", args[1])
|
||||
@@ -409,26 +412,22 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
|
||||
}
|
||||
|
||||
if sb.Len() > 0 && multiline == MultilineNone {
|
||||
newMessage := api.Message{Role: "user", Content: sb.String()}
|
||||
|
||||
opts.Prompt = sb.String()
|
||||
if multiModal {
|
||||
msg, images, err := extractFileData(sb.String())
|
||||
newPrompt, images, err := extractFileData(sb.String())
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
newMessage.Content = msg
|
||||
opts.Prompt = newPrompt
|
||||
|
||||
// reset the context if we find another image
|
||||
if len(images) > 0 {
|
||||
newMessage.Images = append(newMessage.Images, images...)
|
||||
// reset the context for the new image
|
||||
opts.Messages = []api.Message{}
|
||||
} else {
|
||||
if len(opts.Messages) > 1 {
|
||||
newMessage.Images = append(newMessage.Images, opts.Messages[len(opts.Messages)-2].Images...)
|
||||
}
|
||||
opts.Images = images
|
||||
ctx := cmd.Context()
|
||||
ctx = context.WithValue(ctx, generateContextKey("context"), []int{})
|
||||
cmd.SetContext(ctx)
|
||||
}
|
||||
if len(newMessage.Images) == 0 {
|
||||
if len(opts.Images) == 0 {
|
||||
fmt.Println("This model requires you to add a jpeg, png, or svg image.")
|
||||
fmt.Println()
|
||||
sb.Reset()
|
||||
@@ -436,18 +435,9 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
|
||||
}
|
||||
}
|
||||
|
||||
if opts.System != "" {
|
||||
opts.Messages = append(opts.Messages, api.Message{Role: "system", Content: opts.System})
|
||||
}
|
||||
opts.Messages = append(opts.Messages, newMessage)
|
||||
|
||||
assistant, err := chat(cmd, opts)
|
||||
if err != nil {
|
||||
if err := generate(cmd, opts); err != nil {
|
||||
return err
|
||||
}
|
||||
if assistant != nil {
|
||||
opts.Messages = append(opts.Messages, *assistant)
|
||||
}
|
||||
|
||||
sb.Reset()
|
||||
}
|
||||
@@ -489,9 +479,9 @@ func extractFileNames(input string) []string {
|
||||
return re.FindAllString(input, -1)
|
||||
}
|
||||
|
||||
func extractFileData(input string) (string, []api.ImageData, error) {
|
||||
func extractFileData(input string) (string, []ImageData, error) {
|
||||
filePaths := extractFileNames(input)
|
||||
var imgs []api.ImageData
|
||||
var imgs []ImageData
|
||||
|
||||
for _, fp := range filePaths {
|
||||
nfp := normalizeFilePath(fp)
|
||||
|
||||
@@ -1,9 +1,13 @@
|
||||
# Development
|
||||
|
||||
- Install cmake or (optionally, required tools for GPUs)
|
||||
- run `go generate ./...`
|
||||
- run `go build .`
|
||||
|
||||
Install required tools:
|
||||
|
||||
- cmake version 3.24 or higher
|
||||
- go version 1.21 or higher
|
||||
- go version 1.20 or higher
|
||||
- gcc version 11.4.0 or higher
|
||||
|
||||
```bash
|
||||
@@ -13,11 +17,7 @@ brew install go cmake gcc
|
||||
Optionally enable debugging and more verbose logging:
|
||||
|
||||
```bash
|
||||
# At build time
|
||||
export CGO_CFLAGS="-g"
|
||||
|
||||
# At runtime
|
||||
export OLLAMA_DEBUG=1
|
||||
```
|
||||
|
||||
Get the required libraries and build the native LLM code:
|
||||
@@ -44,14 +44,7 @@ Now you can run `ollama`:
|
||||
|
||||
*Your operating system distribution may already have packages for NVIDIA CUDA. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!*
|
||||
|
||||
Install `cmake` and `golang` as well as [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads)
|
||||
development and runtime packages.
|
||||
|
||||
Typically the build scripts will auto-detect CUDA, however, if your Linux distro
|
||||
or installation approach uses unusual paths, you can specify the location by
|
||||
specifying an environment variable `CUDA_LIB_DIR` to the location of the shared
|
||||
libraries, and `CUDACXX` to the location of the nvcc compiler.
|
||||
|
||||
Install `cmake` and `golang` as well as [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads) development and runtime packages.
|
||||
Then generate dependencies:
|
||||
|
||||
```
|
||||
@@ -69,15 +62,10 @@ go build .
|
||||
*Your operating system distribution may already have packages for AMD ROCm and CLBlast. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!*
|
||||
|
||||
Install [CLBlast](https://github.com/CNugteren/CLBlast/blob/master/doc/installation.md) and [ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html) development packages first, as well as `cmake` and `golang`.
|
||||
|
||||
Typically the build scripts will auto-detect ROCm, however, if your Linux distro
|
||||
or installation approach uses unusual paths, you can specify the location by
|
||||
specifying an environment variable `ROCM_PATH` to the location of the ROCm
|
||||
install (typically `/opt/rocm`), and `CLBlast_DIR` to the location of the
|
||||
CLBlast install (typically `/usr/lib/cmake/CLBlast`).
|
||||
Adjust the paths below (correct for Arch) as appropriate for your distribution's install locations and generate dependencies:
|
||||
|
||||
```
|
||||
go generate ./...
|
||||
CLBlast_DIR=/usr/lib/cmake/CLBlast ROCM_PATH=/opt/rocm go generate ./...
|
||||
```
|
||||
|
||||
Then build the binary:
|
||||
@@ -88,22 +76,6 @@ go build .
|
||||
|
||||
ROCm requires elevated privileges to access the GPU at runtime. On most distros you can add your user account to the `render` group, or run as root.
|
||||
|
||||
#### Advanced CPU Settings
|
||||
|
||||
By default, running `go generate ./...` will compile a few different variations
|
||||
of the LLM library based on common CPU families and vector math capabilities,
|
||||
including a lowest-common-denominator which should run on almost any 64 bit CPU
|
||||
somewhat slowly. At runtime, Ollama will auto-detect the optimal variation to
|
||||
load. If you would like to build a CPU-based build customized for your
|
||||
processor, you can set `OLLAMA_CUSTOM_CPU_DEFS` to the llama.cpp flags you would
|
||||
like to use. For example, to compile an optimized binary for an Intel i9-9880H,
|
||||
you might use:
|
||||
|
||||
```
|
||||
OLLAMA_CUSTOM_CPU_DEFS="-DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_F16C=on -DLLAMA_FMA=on" go generate ./...
|
||||
go build .
|
||||
```
|
||||
|
||||
#### Containerized Linux Build
|
||||
|
||||
If you have Docker available, you can build linux binaries with `./scripts/build_linux.sh` which has the CUDA and ROCm dependencies included. The resulting binary is placed in `./dist`
|
||||
@@ -116,7 +88,7 @@ Note: The windows build for Ollama is still under development.
|
||||
Install required tools:
|
||||
|
||||
- MSVC toolchain - C/C++ and cmake as minimal requirements
|
||||
- go version 1.21 or higher
|
||||
- go version 1.20 or higher
|
||||
- MinGW (pick one variant) with GCC.
|
||||
- <https://www.mingw-w64.org/>
|
||||
- <https://www.msys2.org/>
|
||||
|
||||
@@ -109,9 +109,8 @@ Remove the ollama binary from your bin directory (either `/usr/local/bin`, `/usr
|
||||
sudo rm $(which ollama)
|
||||
```
|
||||
|
||||
Remove the downloaded models and Ollama service user and group:
|
||||
Remove the downloaded models and Ollama service user:
|
||||
```bash
|
||||
sudo rm -r /usr/share/ollama
|
||||
sudo userdel ollama
|
||||
sudo groupdel ollama
|
||||
```
|
||||
|
||||
@@ -16,38 +16,7 @@ If manually running `ollama serve` in a terminal, the logs will be on that termi
|
||||
|
||||
Join the [Discord](https://discord.gg/ollama) for help interpreting the logs.
|
||||
|
||||
## LLM libraries
|
||||
|
||||
Ollama includes multiple LLM libraries compiled for different GPUs and CPU
|
||||
vector features. Ollama tries to pick the best one based on the capabilities of
|
||||
your system. If this autodetection has problems, or you run into other problems
|
||||
(e.g. crashes in your GPU) you can work around this by forcing a specific LLM
|
||||
library. `cpu_avx2` will perform the best, followed by `cpu_avx` and the slowest
|
||||
but most compatible is `cpu`. Rosetta emulation under MacOS will work with the
|
||||
`cpu` library.
|
||||
|
||||
In the server log, you will see a message that looks something like this (varies
|
||||
from release to release):
|
||||
|
||||
```
|
||||
Dynamic LLM libraries [rocm_v6 cpu cpu_avx cpu_avx2 cuda_v11 rocm_v5]
|
||||
```
|
||||
|
||||
**Experimental LLM Library Override**
|
||||
|
||||
You can set OLLAMA_LLM_LIBRARY to any of the available LLM libraries to bypass
|
||||
autodetection, so for example, if you have a CUDA card, but want to force the
|
||||
CPU LLM library with AVX2 vector support, use:
|
||||
|
||||
```
|
||||
OLLAMA_LLM_LIBRARY="cpu_avx2" ollama serve
|
||||
```
|
||||
|
||||
You can see what features your CPU has with the following.
|
||||
```
|
||||
cat /proc/cpuinfo| grep flags | head -1
|
||||
```
|
||||
|
||||
## Known issues
|
||||
|
||||
* N/A
|
||||
|
||||
* `signal: illegal instruction (core dumped)`: Ollama requires AVX support from the CPU. This was introduced in 2011 and CPUs started offering it in 2012. CPUs from before that and some lower end CPUs after that may not have AVX support and thus are not supported by Ollama. Some users have had luck with building Ollama on their machines disabling the need for AVX.
|
||||
|
||||
4
go.mod
4
go.mod
@@ -1,6 +1,6 @@
|
||||
module github.com/jmorganca/ollama
|
||||
|
||||
go 1.21
|
||||
go 1.20
|
||||
|
||||
require (
|
||||
github.com/emirpasic/gods v1.18.1
|
||||
@@ -45,7 +45,7 @@ require (
|
||||
golang.org/x/crypto v0.14.0
|
||||
golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63
|
||||
golang.org/x/net v0.17.0 // indirect
|
||||
golang.org/x/sys v0.13.0
|
||||
golang.org/x/sys v0.13.0 // indirect
|
||||
golang.org/x/term v0.13.0
|
||||
golang.org/x/text v0.13.0 // indirect
|
||||
google.golang.org/protobuf v1.30.0 // indirect
|
||||
|
||||
@@ -1,21 +0,0 @@
|
||||
package gpu
|
||||
|
||||
import (
|
||||
"log/slog"
|
||||
|
||||
"golang.org/x/sys/cpu"
|
||||
)
|
||||
|
||||
func GetCPUVariant() string {
|
||||
if cpu.X86.HasAVX2 {
|
||||
slog.Info("CPU has AVX2")
|
||||
return "avx2"
|
||||
}
|
||||
if cpu.X86.HasAVX {
|
||||
slog.Info("CPU has AVX")
|
||||
return "avx"
|
||||
}
|
||||
slog.Info("CPU does not have vector extensions")
|
||||
// else LCD
|
||||
return ""
|
||||
}
|
||||
214
gpu/gpu.go
214
gpu/gpu.go
@@ -12,11 +12,8 @@ package gpu
|
||||
import "C"
|
||||
import (
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"log"
|
||||
"runtime"
|
||||
"strings"
|
||||
"sync"
|
||||
"unsafe"
|
||||
)
|
||||
@@ -32,80 +29,31 @@ var gpuHandles *handles = nil
|
||||
// With our current CUDA compile flags, 5.2 and older will not work properly
|
||||
const CudaComputeMajorMin = 6
|
||||
|
||||
// Glob patterns (filepath.Glob syntax) naming where the vendor GPU management
// libraries may be installed on each supported operating system.
var (
	// CudaLinuxGlobs lists possible locations of the nvidia-ml library on Linux.
	CudaLinuxGlobs = []string{
		"/usr/local/cuda/lib64/libnvidia-ml.so*",
		"/usr/lib/x86_64-linux-gnu/nvidia/current/libnvidia-ml.so*",
		"/usr/lib/x86_64-linux-gnu/libnvidia-ml.so*",
		"/usr/lib/wsl/lib/libnvidia-ml.so*",
		"/opt/cuda/lib64/libnvidia-ml.so*",
		"/opt/cuda/targets/x86_64-linux/lib/stubs/libnvidia-ml.so*",
		"/usr/lib*/libnvidia-ml.so*",
		"/usr/local/lib*/libnvidia-ml.so*",
		"/usr/lib/aarch64-linux-gnu/nvidia/current/libnvidia-ml.so*",
		"/usr/lib/aarch64-linux-gnu/libnvidia-ml.so*",
	}

	// CudaWindowsGlobs lists possible locations of the NVML DLL on Windows.
	CudaWindowsGlobs = []string{
		"c:\\Windows\\System32\\nvml.dll",
	}

	// RocmLinuxGlobs lists possible locations of the ROCm SMI library on Linux.
	RocmLinuxGlobs = []string{
		"/opt/rocm*/lib*/librocm_smi64.so*",
	}

	// RocmWindowsGlobs lists possible locations of the ROCm SMI DLL on Windows.
	RocmWindowsGlobs = []string{
		"c:\\Windows\\System32\\rocm_smi64.dll",
	}
)
|
||||
|
||||
// Note: gpuMutex must already be held
|
||||
func initGPUHandles() {
|
||||
|
||||
// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing
|
||||
|
||||
log.Printf("Detecting GPU type")
|
||||
gpuHandles = &handles{nil, nil}
|
||||
var cudaMgmtName string
|
||||
var cudaMgmtPatterns []string
|
||||
var rocmMgmtName string
|
||||
var rocmMgmtPatterns []string
|
||||
switch runtime.GOOS {
|
||||
case "windows":
|
||||
cudaMgmtName = "nvml.dll"
|
||||
cudaMgmtPatterns = make([]string, len(CudaWindowsGlobs))
|
||||
copy(cudaMgmtPatterns, CudaWindowsGlobs)
|
||||
rocmMgmtName = "rocm_smi64.dll"
|
||||
rocmMgmtPatterns = make([]string, len(RocmWindowsGlobs))
|
||||
copy(rocmMgmtPatterns, RocmWindowsGlobs)
|
||||
case "linux":
|
||||
cudaMgmtName = "libnvidia-ml.so"
|
||||
cudaMgmtPatterns = make([]string, len(CudaLinuxGlobs))
|
||||
copy(cudaMgmtPatterns, CudaLinuxGlobs)
|
||||
rocmMgmtName = "librocm_smi64.so"
|
||||
rocmMgmtPatterns = make([]string, len(RocmLinuxGlobs))
|
||||
copy(rocmMgmtPatterns, RocmLinuxGlobs)
|
||||
default:
|
||||
return
|
||||
}
|
||||
var resp C.cuda_init_resp_t
|
||||
C.cuda_init(&resp)
|
||||
if resp.err != nil {
|
||||
log.Printf("CUDA not detected: %s", C.GoString(resp.err))
|
||||
C.free(unsafe.Pointer(resp.err))
|
||||
|
||||
slog.Info("Detecting GPU type")
|
||||
cudaLibPaths := FindGPULibs(cudaMgmtName, cudaMgmtPatterns)
|
||||
if len(cudaLibPaths) > 0 {
|
||||
cuda := LoadCUDAMgmt(cudaLibPaths)
|
||||
if cuda != nil {
|
||||
slog.Info("Nvidia GPU detected")
|
||||
gpuHandles.cuda = cuda
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
rocmLibPaths := FindGPULibs(rocmMgmtName, rocmMgmtPatterns)
|
||||
if len(rocmLibPaths) > 0 {
|
||||
rocm := LoadROCMMgmt(rocmLibPaths)
|
||||
if rocm != nil {
|
||||
slog.Info("Radeon GPU detected")
|
||||
gpuHandles.rocm = rocm
|
||||
return
|
||||
var resp C.rocm_init_resp_t
|
||||
C.rocm_init(&resp)
|
||||
if resp.err != nil {
|
||||
log.Printf("ROCm not detected: %s", C.GoString(resp.err))
|
||||
C.free(unsafe.Pointer(resp.err))
|
||||
} else {
|
||||
log.Printf("Radeon GPU detected")
|
||||
rocm := resp.rh
|
||||
gpuHandles.rocm = &rocm
|
||||
}
|
||||
} else {
|
||||
log.Printf("Nvidia GPU detected")
|
||||
cuda := resp.ch
|
||||
gpuHandles.cuda = &cuda
|
||||
}
|
||||
}
|
||||
|
||||
@@ -123,47 +71,42 @@ func GetGPUInfo() GpuInfo {
|
||||
if gpuHandles.cuda != nil {
|
||||
C.cuda_check_vram(*gpuHandles.cuda, &memInfo)
|
||||
if memInfo.err != nil {
|
||||
slog.Info(fmt.Sprintf("error looking up CUDA GPU memory: %s", C.GoString(memInfo.err)))
|
||||
log.Printf("error looking up CUDA GPU memory: %s", C.GoString(memInfo.err))
|
||||
C.free(unsafe.Pointer(memInfo.err))
|
||||
} else {
|
||||
// Verify minimum compute capability
|
||||
var cc C.cuda_compute_capability_t
|
||||
C.cuda_compute_capability(*gpuHandles.cuda, &cc)
|
||||
if cc.err != nil {
|
||||
slog.Info(fmt.Sprintf("error looking up CUDA GPU compute capability: %s", C.GoString(cc.err)))
|
||||
log.Printf("error looking up CUDA GPU compute capability: %s", C.GoString(cc.err))
|
||||
C.free(unsafe.Pointer(cc.err))
|
||||
} else if cc.major >= CudaComputeMajorMin {
|
||||
slog.Info(fmt.Sprintf("CUDA Compute Capability detected: %d.%d", cc.major, cc.minor))
|
||||
log.Printf("CUDA Compute Capability detected: %d.%d", cc.major, cc.minor)
|
||||
resp.Library = "cuda"
|
||||
} else {
|
||||
slog.Info(fmt.Sprintf("CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
|
||||
log.Printf("CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor)
|
||||
}
|
||||
}
|
||||
} else if gpuHandles.rocm != nil {
|
||||
C.rocm_check_vram(*gpuHandles.rocm, &memInfo)
|
||||
if memInfo.err != nil {
|
||||
slog.Info(fmt.Sprintf("error looking up ROCm GPU memory: %s", C.GoString(memInfo.err)))
|
||||
log.Printf("error looking up ROCm GPU memory: %s", C.GoString(memInfo.err))
|
||||
C.free(unsafe.Pointer(memInfo.err))
|
||||
} else {
|
||||
resp.Library = "rocm"
|
||||
var version C.rocm_version_resp_t
|
||||
C.rocm_get_version(*gpuHandles.rocm, &version)
|
||||
verString := C.GoString(version.str)
|
||||
if version.status == 0 {
|
||||
resp.Variant = "v" + verString
|
||||
} else {
|
||||
slog.Info(fmt.Sprintf("failed to look up ROCm version: %s", verString))
|
||||
}
|
||||
C.free(unsafe.Pointer(version.str))
|
||||
}
|
||||
}
|
||||
if resp.Library == "" {
|
||||
C.cpu_check_ram(&memInfo)
|
||||
resp.Library = "cpu"
|
||||
resp.Variant = GetCPUVariant()
|
||||
// In the future we may offer multiple CPU variants to tune CPU features
|
||||
if runtime.GOOS == "windows" {
|
||||
resp.Library = "cpu"
|
||||
} else {
|
||||
resp.Library = "default"
|
||||
}
|
||||
}
|
||||
if memInfo.err != nil {
|
||||
slog.Info(fmt.Sprintf("error looking up CPU memory: %s", C.GoString(memInfo.err)))
|
||||
log.Printf("error looking up CPU memory: %s", C.GoString(memInfo.err))
|
||||
C.free(unsafe.Pointer(memInfo.err))
|
||||
return resp
|
||||
}
|
||||
@@ -190,94 +133,13 @@ func getCPUMem() (memInfo, error) {
|
||||
func CheckVRAM() (int64, error) {
|
||||
gpuInfo := GetGPUInfo()
|
||||
if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") {
|
||||
return int64(gpuInfo.FreeMemory), nil
|
||||
// leave 10% or 384Mi of VRAM free for unaccounted for overhead
|
||||
overhead := gpuInfo.FreeMemory * uint64(gpuInfo.DeviceCount) / 10
|
||||
if overhead < 384*1024*1024 {
|
||||
overhead = 384 * 1024 * 1024
|
||||
}
|
||||
return int64(gpuInfo.FreeMemory - overhead), nil
|
||||
}
|
||||
|
||||
return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU based memory determiniation
|
||||
}
|
||||
|
||||
// maxGPULibLinkHops bounds how many symbolic links resolveGPULibLink follows
// before concluding that a candidate library is part of a link cycle.
const maxGPULibLinkHops = 32

// resolveGPULibLink follows the chain of symbolic links starting at path and
// returns the final target. The boolean is false when the chain is longer than
// maxGPULibLinkHops, which in practice means the links form a cycle.
func resolveGPULibLink(path string) (string, bool) {
	// Anchor relative matches first so that relative link targets are joined
	// onto the right directory.
	if abs, err := filepath.Abs(path); err == nil {
		path = abs
	}
	for hops := 0; hops < maxGPULibLinkHops; hops++ {
		target, err := os.Readlink(path)
		if err != nil {
			// Not a link (or unreadable): this is as far as we can resolve.
			return path, true
		}
		if !filepath.IsAbs(target) {
			target = filepath.Join(filepath.Dir(path), target)
		}
		path = target
	}
	return "", false
}

// FindGPULibs returns the candidate GPU management libraries present on this
// host.
//
// The search set is the caller-supplied glob patterns followed by
// "<dir>/<baseLibName>*" for every non-empty directory listed in PATH
// (Windows) or LD_LIBRARY_PATH (Linux). Every match is resolved through its
// symbolic links and duplicates are dropped, so each library file is reported
// once, in discovery order. Multiple libraries may exist and some may not
// work, so callers are expected to try them in turn.
//
// On any other operating system an empty slice is returned. The patterns slice
// is never modified.
func FindGPULibs(baseLibName string, patterns []string) []string {
	gpuLibPaths := []string{}
	slog.Info(fmt.Sprintf("Searching for GPU management library %s", baseLibName))

	var ldPaths []string
	switch runtime.GOOS {
	case "windows":
		ldPaths = strings.Split(os.Getenv("PATH"), ";")
	case "linux":
		ldPaths = strings.Split(os.Getenv("LD_LIBRARY_PATH"), ":")
	default:
		return gpuLibPaths
	}

	// Work on a private copy so that appending never writes into spare
	// capacity of the caller's slice.
	search := make([]string, len(patterns), len(patterns)+len(ldPaths))
	copy(search, patterns)

	// Add whatever directories are listed in PATH/LD_LIBRARY_PATH.
	for _, ldPath := range ldPaths {
		if ldPath == "" {
			// An unset or empty variable must not turn into a search of the
			// current working directory (filepath.Abs("") is the cwd).
			continue
		}
		d, err := filepath.Abs(ldPath)
		if err != nil {
			continue
		}
		search = append(search, filepath.Join(d, baseLibName+"*"))
	}
	slog.Debug(fmt.Sprintf("gpu management search paths: %v", search))

	seen := make(map[string]struct{})
	for _, pattern := range search {
		// Ignore glob discovery errors
		matches, _ := filepath.Glob(pattern)
		for _, match := range matches {
			// Resolve any links so we don't try the same lib multiple times
			// and weed out any dups across globs
			libPath, ok := resolveGPULibLink(match)
			if !ok {
				// Link cycle: there is no real file to load.
				continue
			}
			if _, dup := seen[libPath]; dup {
				continue
			}
			seen[libPath] = struct{}{}
			gpuLibPaths = append(gpuLibPaths, libPath)
		}
	}
	slog.Info(fmt.Sprintf("Discovered GPU libraries: %v", gpuLibPaths))
	return gpuLibPaths
}
|
||||
|
||||
func LoadCUDAMgmt(cudaLibPaths []string) *C.cuda_handle_t {
|
||||
var resp C.cuda_init_resp_t
|
||||
for _, libPath := range cudaLibPaths {
|
||||
lib := C.CString(libPath)
|
||||
defer C.free(unsafe.Pointer(lib))
|
||||
C.cuda_init(lib, &resp)
|
||||
if resp.err != nil {
|
||||
slog.Info(fmt.Sprintf("Unable to load CUDA management library %s: %s", libPath, C.GoString(resp.err)))
|
||||
C.free(unsafe.Pointer(resp.err))
|
||||
} else {
|
||||
return &resp.ch
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func LoadROCMMgmt(rocmLibPaths []string) *C.rocm_handle_t {
|
||||
var resp C.rocm_init_resp_t
|
||||
for _, libPath := range rocmLibPaths {
|
||||
lib := C.CString(libPath)
|
||||
defer C.free(unsafe.Pointer(lib))
|
||||
C.rocm_init(lib, &resp)
|
||||
if resp.err != nil {
|
||||
slog.Info(fmt.Sprintf("Unable to load ROCm management library %s: %s", libPath, C.GoString(resp.err)))
|
||||
C.free(unsafe.Pointer(resp.err))
|
||||
} else {
|
||||
return &resp.rh
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -32,15 +32,8 @@ func CheckVRAM() (int64, error) {
|
||||
|
||||
func GetGPUInfo() GpuInfo {
|
||||
mem, _ := getCPUMem()
|
||||
if runtime.GOARCH == "amd64" {
|
||||
return GpuInfo{
|
||||
Library: "cpu",
|
||||
Variant: GetCPUVariant(),
|
||||
memInfo: mem,
|
||||
}
|
||||
}
|
||||
return GpuInfo{
|
||||
Library: "metal",
|
||||
Library: "default",
|
||||
memInfo: mem,
|
||||
}
|
||||
}
|
||||
@@ -52,3 +45,7 @@ func getCPUMem() (memInfo, error) {
|
||||
DeviceCount: 0,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func nativeInit() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -4,9 +4,33 @@
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#ifndef _WIN32
|
||||
const char *cuda_lib_paths[] = {
|
||||
"libnvidia-ml.so",
|
||||
"/usr/lib/wsl/lib/libnvidia-ml.so", // TODO Maybe glob?
|
||||
"/usr/lib/wsl/lib/libnvidia-ml.so.1",
|
||||
"/usr/local/cuda/lib64/libnvidia-ml.so",
|
||||
"/usr/lib/libnvidia-ml.so",
|
||||
"/usr/lib/libnvidia-ml.so.1",
|
||||
"/usr/lib/x86_64-linux-gnu/nvidia/current/libnvidia-ml.so",
|
||||
"/usr/lib/x86_64-linux-gnu/libnvidia-ml.so",
|
||||
"/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1",
|
||||
"/usr/lib/aarch64-linux-gnu/nvidia/current/libnvidia-ml.so",
|
||||
"/usr/lib/aarch64-linux-gnu/libnvidia-ml.so",
|
||||
"/usr/lib/aarch64-linux-gnu/libnvidia-ml.so.1",
|
||||
NULL,
|
||||
};
|
||||
#else
|
||||
const char *cuda_lib_paths[] = {
|
||||
"nvml.dll",
|
||||
"",
|
||||
NULL,
|
||||
};
|
||||
#endif
|
||||
|
||||
#define CUDA_LOOKUP_SIZE 6
|
||||
|
||||
void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
|
||||
void cuda_init(cuda_init_resp_t *resp) {
|
||||
nvmlReturn_t ret;
|
||||
resp->err = NULL;
|
||||
const int buflen = 256;
|
||||
@@ -25,12 +49,16 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
|
||||
{"nvmlDeviceGetCudaComputeCapability", (void *)&resp->ch.getComputeCapability},
|
||||
};
|
||||
|
||||
resp->ch.handle = LOAD_LIBRARY(cuda_lib_path, RTLD_LAZY);
|
||||
for (i = 0; cuda_lib_paths[i] != NULL && resp->ch.handle == NULL; i++) {
|
||||
resp->ch.handle = LOAD_LIBRARY(cuda_lib_paths[i], RTLD_LAZY);
|
||||
}
|
||||
if (!resp->ch.handle) {
|
||||
// TODO improve error message, as the LOAD_ERR will have typically have the
|
||||
// final path that was checked which might be confusing.
|
||||
char *msg = LOAD_ERR();
|
||||
snprintf(buf, buflen,
|
||||
"Unable to load %s library to query for Nvidia GPUs: %s",
|
||||
cuda_lib_path, msg);
|
||||
cuda_lib_paths[0], msg);
|
||||
free(msg);
|
||||
resp->err = strdup(buf);
|
||||
return;
|
||||
@@ -52,8 +80,6 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
|
||||
|
||||
ret = (*resp->ch.initFn)();
|
||||
if (ret != NVML_SUCCESS) {
|
||||
UNLOAD_LIBRARY(resp->ch.handle);
|
||||
resp->ch.handle = NULL;
|
||||
snprintf(buf, buflen, "nvml vram init failure: %d", ret);
|
||||
resp->err = strdup(buf);
|
||||
}
|
||||
|
||||
@@ -36,7 +36,7 @@ typedef struct cuda_compute_capability {
|
||||
int minor;
|
||||
} cuda_compute_capability_t;
|
||||
|
||||
void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp);
|
||||
void cuda_init(cuda_init_resp_t *resp);
|
||||
void cuda_check_vram(cuda_handle_t ch, mem_info_t *resp);
|
||||
void cuda_compute_capability(cuda_handle_t ch, cuda_compute_capability_t *cc);
|
||||
|
||||
|
||||
@@ -4,9 +4,22 @@
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#define ROCM_LOOKUP_SIZE 5
|
||||
#ifndef _WIN32
|
||||
const char *rocm_lib_paths[] = {
|
||||
"librocm_smi64.so",
|
||||
"/opt/rocm/lib/librocm_smi64.so",
|
||||
NULL,
|
||||
};
|
||||
#else
|
||||
// TODO untested
|
||||
const char *rocm_lib_paths[] = {
|
||||
"rocm_smi64.dll",
|
||||
"/opt/rocm/lib/rocm_smi64.dll",
|
||||
NULL,
|
||||
};
|
||||
#endif
|
||||
|
||||
void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
|
||||
void rocm_init(rocm_init_resp_t *resp) {
|
||||
rsmi_status_t ret;
|
||||
resp->err = NULL;
|
||||
const int buflen = 256;
|
||||
@@ -15,31 +28,31 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
|
||||
struct lookup {
|
||||
char *s;
|
||||
void **p;
|
||||
} l[ROCM_LOOKUP_SIZE] = {
|
||||
} l[4] = {
|
||||
{"rsmi_init", (void *)&resp->rh.initFn},
|
||||
{"rsmi_shut_down", (void *)&resp->rh.shutdownFn},
|
||||
{"rsmi_dev_memory_total_get", (void *)&resp->rh.totalMemFn},
|
||||
{"rsmi_dev_memory_usage_get", (void *)&resp->rh.usageMemFn},
|
||||
{"rsmi_version_get", (void *)&resp->rh.versionGetFn},
|
||||
// { "rsmi_dev_id_get", (void*)&resp->rh.getHandle },
|
||||
};
|
||||
|
||||
resp->rh.handle = LOAD_LIBRARY(rocm_lib_path, RTLD_LAZY);
|
||||
for (i = 0; rocm_lib_paths[i] != NULL && resp->rh.handle == NULL; i++) {
|
||||
resp->rh.handle = LOAD_LIBRARY(rocm_lib_paths[i], RTLD_LAZY);
|
||||
}
|
||||
if (!resp->rh.handle) {
|
||||
char *msg = LOAD_ERR();
|
||||
snprintf(buf, buflen,
|
||||
"Unable to load %s library to query for Radeon GPUs: %s\n",
|
||||
rocm_lib_path, msg);
|
||||
rocm_lib_paths[0], msg);
|
||||
free(msg);
|
||||
resp->err = strdup(buf);
|
||||
return;
|
||||
}
|
||||
|
||||
for (i = 0; i < ROCM_LOOKUP_SIZE; i++) {
|
||||
for (i = 0; i < 4; i++) {
|
||||
*l[i].p = LOAD_SYMBOL(resp->rh.handle, l[i].s);
|
||||
if (!l[i].p) {
|
||||
UNLOAD_LIBRARY(resp->rh.handle);
|
||||
resp->rh.handle = NULL;
|
||||
char *msg = LOAD_ERR();
|
||||
snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
|
||||
msg);
|
||||
@@ -51,8 +64,6 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
|
||||
|
||||
ret = (*resp->rh.initFn)(0);
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
UNLOAD_LIBRARY(resp->rh.handle);
|
||||
resp->rh.handle = NULL;
|
||||
snprintf(buf, buflen, "rocm vram init failure: %d", ret);
|
||||
resp->err = strdup(buf);
|
||||
}
|
||||
@@ -72,7 +83,7 @@ void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
|
||||
int i;
|
||||
|
||||
if (h.handle == NULL) {
|
||||
resp->err = strdup("rocm handle not initialized");
|
||||
resp->err = strdup("nvml handle sn't initialized");
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -106,25 +117,4 @@ void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Looks up the ROCm SMI library version through the loaded handle.
//
// On success resp->status is 0 and resp->str holds the major version number as
// a decimal string. On failure resp->status is 1 and resp->str holds an error
// description. In both cases resp->str is allocated with strdup and must be
// freed by the caller.
void rocm_get_version(rocm_handle_t h, rocm_version_resp_t *resp) {
  const int buflen = 256;
  char buf[buflen + 1];
  if (h.handle == NULL) {
    resp->str = strdup("rocm handle not initialized");
    resp->status = 1;
    return;
  }
  if (h.versionGetFn == NULL) {
    // Guard against a library in which the version symbol was not resolved.
    resp->str = strdup("rocm version lookup function not available");
    resp->status = 1;
    return;
  }
  rsmi_version_t ver;
  rsmi_status_t ret = h.versionGetFn(&ver);
  if (ret != RSMI_STATUS_SUCCESS) {
    snprintf(buf, buflen, "unexpected response on version lookup %d", ret);
    resp->status = 1;
  } else {
    // major is a uint32_t, so format it as unsigned.
    snprintf(buf, buflen, "%u", (unsigned int)ver.major);
    resp->status = 0;
  }
  resp->str = strdup(buf);
}
|
||||
|
||||
#endif // __APPLE__
|
||||
@@ -15,20 +15,12 @@ typedef enum rsmi_memory_type {
|
||||
RSMI_MEM_TYPE_GTT,
|
||||
} rsmi_memory_type_t;
|
||||
|
||||
typedef struct {
|
||||
uint32_t major;
|
||||
uint32_t minor;
|
||||
uint32_t patch;
|
||||
const char *build;
|
||||
} rsmi_version_t;
|
||||
|
||||
typedef struct rocm_handle {
|
||||
void *handle;
|
||||
rsmi_status_t (*initFn)(uint64_t);
|
||||
rsmi_status_t (*shutdownFn)(void);
|
||||
rsmi_status_t (*totalMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *);
|
||||
rsmi_status_t (*usageMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *);
|
||||
rsmi_status_t (*versionGetFn) (rsmi_version_t *version);
|
||||
// rsmi_status_t (*getHandle)(uint32_t, uint16_t *);
|
||||
} rocm_handle_t;
|
||||
|
||||
@@ -37,14 +29,8 @@ typedef struct rocm_init_resp {
|
||||
rocm_handle_t rh;
|
||||
} rocm_init_resp_t;
|
||||
|
||||
typedef struct rocm_version_resp {
|
||||
rsmi_status_t status;
|
||||
char *str; // Contains version or error string if status != 0
|
||||
} rocm_version_resp_t;
|
||||
|
||||
void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp);
|
||||
void rocm_init(rocm_init_resp_t *resp);
|
||||
void rocm_check_vram(rocm_handle_t rh, mem_info_t *resp);
|
||||
void rocm_get_version(rocm_handle_t rh, rocm_version_resp_t *resp);
|
||||
|
||||
#endif // __GPU_INFO_ROCM_H__
|
||||
#endif // __APPLE__
|
||||
@@ -9,7 +9,7 @@ import (
|
||||
|
||||
func TestBasicGetGPUInfo(t *testing.T) {
|
||||
info := GetGPUInfo()
|
||||
assert.Contains(t, "cuda rocm cpu metal", info.Library)
|
||||
assert.Contains(t, "cuda rocm cpu default", info.Library)
|
||||
|
||||
switch runtime.GOOS {
|
||||
case "darwin":
|
||||
@@ -18,7 +18,7 @@ func TestBasicGetGPUInfo(t *testing.T) {
|
||||
case "linux", "windows":
|
||||
assert.Greater(t, info.TotalMemory, uint64(0))
|
||||
assert.Greater(t, info.FreeMemory, uint64(0))
|
||||
assert.Greater(t, info.DeviceCount, uint32(0))
|
||||
assert.Greater(t, info.DeviceCount, uint64(0))
|
||||
default:
|
||||
return
|
||||
}
|
||||
|
||||
@@ -11,8 +11,5 @@ type GpuInfo struct {
|
||||
memInfo
|
||||
Library string `json:"library,omitempty"`
|
||||
|
||||
// Optional variant to select (e.g. versions, cpu feature flags)
|
||||
Variant string `json:"variant,omitempty"`
|
||||
|
||||
// TODO add other useful attributes about the card here for discovery information
|
||||
}
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
#include "dyn_ext_server.h"
|
||||
#include "dynamic_shim.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
#ifdef __linux__
|
||||
#include <dlfcn.h>
|
||||
#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags)
|
||||
#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags | RTLD_DEEPBIND)
|
||||
#define LOAD_SYMBOL(handle, sym) dlsym(handle, sym)
|
||||
#define LOAD_ERR() strdup(dlerror())
|
||||
#define UNLOAD_LIBRARY(handle) dlclose(handle)
|
||||
@@ -33,7 +33,7 @@ inline char *LOAD_ERR() {
|
||||
#define UNLOAD_LIBRARY(handle) dlclose(handle)
|
||||
#endif
|
||||
|
||||
void dyn_init(const char *libPath, struct dynamic_llama_server *s,
|
||||
void dynamic_shim_init(const char *libPath, struct dynamic_llama_server *s,
|
||||
ext_server_resp_t *err) {
|
||||
int i = 0;
|
||||
struct lookup {
|
||||
@@ -58,8 +58,8 @@ void dyn_init(const char *libPath, struct dynamic_llama_server *s,
|
||||
{"", NULL},
|
||||
};
|
||||
|
||||
printf("loading library %s\n", libPath);
|
||||
s->handle = LOAD_LIBRARY(libPath, RTLD_GLOBAL|RTLD_NOW);
|
||||
printf("Lazy loading %s library\n", libPath);
|
||||
s->handle = LOAD_LIBRARY(libPath, RTLD_NOW);
|
||||
if (!s->handle) {
|
||||
err->id = -1;
|
||||
char *msg = LOAD_ERR();
|
||||
@@ -83,63 +83,63 @@ void dyn_init(const char *libPath, struct dynamic_llama_server *s,
|
||||
}
|
||||
}
|
||||
|
||||
inline void dyn_llama_server_init(struct dynamic_llama_server s,
|
||||
inline void dynamic_shim_llama_server_init(struct dynamic_llama_server s,
|
||||
ext_server_params_t *sparams,
|
||||
ext_server_resp_t *err) {
|
||||
s.llama_server_init(sparams, err);
|
||||
}
|
||||
|
||||
inline void dyn_llama_server_start(struct dynamic_llama_server s) {
|
||||
inline void dynamic_shim_llama_server_start(struct dynamic_llama_server s) {
|
||||
s.llama_server_start();
|
||||
}
|
||||
|
||||
inline void dyn_llama_server_stop(struct dynamic_llama_server s) {
|
||||
inline void dynamic_shim_llama_server_stop(struct dynamic_llama_server s) {
|
||||
s.llama_server_stop();
|
||||
}
|
||||
|
||||
inline void dyn_llama_server_completion(struct dynamic_llama_server s,
|
||||
inline void dynamic_shim_llama_server_completion(struct dynamic_llama_server s,
|
||||
const char *json_req,
|
||||
ext_server_resp_t *resp) {
|
||||
s.llama_server_completion(json_req, resp);
|
||||
}
|
||||
|
||||
inline void dyn_llama_server_completion_next_result(
|
||||
inline void dynamic_shim_llama_server_completion_next_result(
|
||||
struct dynamic_llama_server s, const int task_id,
|
||||
ext_server_task_result_t *result) {
|
||||
s.llama_server_completion_next_result(task_id, result);
|
||||
}
|
||||
|
||||
inline void dyn_llama_server_completion_cancel(
|
||||
inline void dynamic_shim_llama_server_completion_cancel(
|
||||
struct dynamic_llama_server s, const int task_id, ext_server_resp_t *err) {
|
||||
s.llama_server_completion_cancel(task_id, err);
|
||||
}
|
||||
inline void dyn_llama_server_release_task_result(
|
||||
inline void dynamic_shim_llama_server_release_task_result(
|
||||
struct dynamic_llama_server s, ext_server_task_result_t *result) {
|
||||
s.llama_server_release_task_result(result);
|
||||
}
|
||||
|
||||
inline void dyn_llama_server_tokenize(struct dynamic_llama_server s,
|
||||
inline void dynamic_shim_llama_server_tokenize(struct dynamic_llama_server s,
|
||||
const char *json_req,
|
||||
char **json_resp,
|
||||
ext_server_resp_t *err) {
|
||||
s.llama_server_tokenize(json_req, json_resp, err);
|
||||
}
|
||||
|
||||
inline void dyn_llama_server_detokenize(struct dynamic_llama_server s,
|
||||
inline void dynamic_shim_llama_server_detokenize(struct dynamic_llama_server s,
|
||||
const char *json_req,
|
||||
char **json_resp,
|
||||
ext_server_resp_t *err) {
|
||||
s.llama_server_detokenize(json_req, json_resp, err);
|
||||
}
|
||||
|
||||
inline void dyn_llama_server_embedding(struct dynamic_llama_server s,
|
||||
inline void dynamic_shim_llama_server_embedding(struct dynamic_llama_server s,
|
||||
const char *json_req,
|
||||
char **json_resp,
|
||||
ext_server_resp_t *err) {
|
||||
s.llama_server_embedding(json_req, json_resp, err);
|
||||
}
|
||||
|
||||
inline void dyn_llama_server_release_json_resp(
|
||||
inline void dynamic_shim_llama_server_release_json_resp(
|
||||
struct dynamic_llama_server s, char **json_resp) {
|
||||
s.llama_server_release_json_resp(json_resp);
|
||||
}
|
||||
@@ -27,46 +27,46 @@ struct dynamic_llama_server {
|
||||
void (*llama_server_release_json_resp)(char **json_resp);
|
||||
};
|
||||
|
||||
void dyn_init(const char *libPath, struct dynamic_llama_server *s,
|
||||
void dynamic_shim_init(const char *libPath, struct dynamic_llama_server *s,
|
||||
ext_server_resp_t *err);
|
||||
|
||||
// No good way to call C function pointers from Go so inline the indirection
|
||||
void dyn_llama_server_init(struct dynamic_llama_server s,
|
||||
void dynamic_shim_llama_server_init(struct dynamic_llama_server s,
|
||||
ext_server_params_t *sparams,
|
||||
ext_server_resp_t *err);
|
||||
|
||||
void dyn_llama_server_start(struct dynamic_llama_server s);
|
||||
void dynamic_shim_llama_server_start(struct dynamic_llama_server s);
|
||||
|
||||
void dyn_llama_server_stop(struct dynamic_llama_server s);
|
||||
void dynamic_shim_llama_server_stop(struct dynamic_llama_server s);
|
||||
|
||||
void dyn_llama_server_completion(struct dynamic_llama_server s,
|
||||
void dynamic_shim_llama_server_completion(struct dynamic_llama_server s,
|
||||
const char *json_req,
|
||||
ext_server_resp_t *resp);
|
||||
|
||||
void dyn_llama_server_completion_next_result(
|
||||
void dynamic_shim_llama_server_completion_next_result(
|
||||
struct dynamic_llama_server s, const int task_id,
|
||||
ext_server_task_result_t *result);
|
||||
|
||||
void dyn_llama_server_completion_cancel(struct dynamic_llama_server s,
|
||||
void dynamic_shim_llama_server_completion_cancel(struct dynamic_llama_server s,
|
||||
const int task_id,
|
||||
ext_server_resp_t *err);
|
||||
|
||||
void dyn_llama_server_release_task_result(
|
||||
void dynamic_shim_llama_server_release_task_result(
|
||||
struct dynamic_llama_server s, ext_server_task_result_t *result);
|
||||
|
||||
void dyn_llama_server_tokenize(struct dynamic_llama_server s,
|
||||
void dynamic_shim_llama_server_tokenize(struct dynamic_llama_server s,
|
||||
const char *json_req, char **json_resp,
|
||||
ext_server_resp_t *err);
|
||||
|
||||
void dyn_llama_server_detokenize(struct dynamic_llama_server s,
|
||||
void dynamic_shim_llama_server_detokenize(struct dynamic_llama_server s,
|
||||
const char *json_req,
|
||||
char **json_resp,
|
||||
ext_server_resp_t *err);
|
||||
|
||||
void dyn_llama_server_embedding(struct dynamic_llama_server s,
|
||||
void dynamic_shim_llama_server_embedding(struct dynamic_llama_server s,
|
||||
const char *json_req, char **json_resp,
|
||||
ext_server_resp_t *err);
|
||||
void dyn_llama_server_release_json_resp(struct dynamic_llama_server s,
|
||||
void dynamic_shim_llama_server_release_json_resp(struct dynamic_llama_server s,
|
||||
char **json_resp);
|
||||
|
||||
#ifdef __cplusplus
|
||||
@@ -2,24 +2,28 @@
|
||||
|
||||
set(TARGET ext_server)
|
||||
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
|
||||
if (WIN32)
|
||||
add_library(${TARGET} SHARED ../../../ext_server/ext_server.cpp ../../llama.cpp)
|
||||
else()
|
||||
add_library(${TARGET} STATIC ../../../ext_server/ext_server.cpp ../../llama.cpp)
|
||||
endif()
|
||||
add_library(${TARGET} STATIC ../../../ext_server/ext_server.cpp)
|
||||
target_include_directories(${TARGET} PRIVATE ../../common)
|
||||
target_include_directories(${TARGET} PRIVATE ../..)
|
||||
target_include_directories(${TARGET} PRIVATE ../../..)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
target_compile_definitions(${TARGET} PUBLIC LLAMA_SERVER_LIBRARY=1)
|
||||
target_link_libraries(${TARGET} PRIVATE ggml llava common )
|
||||
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
target_compile_definitions(${TARGET} PRIVATE SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>)
|
||||
install(TARGETS ext_server LIBRARY)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama llava ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_definitions(${TARGET} PRIVATE
|
||||
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
|
||||
)
|
||||
|
||||
if (BUILD_SHARED_LIBS)
|
||||
set_target_properties(ext_server PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
target_compile_definitions(ext_server PRIVATE LLAMA_SHARED LLAMA_BUILD)
|
||||
add_library(ext_server_shared SHARED $<TARGET_OBJECTS:ext_server>)
|
||||
target_link_libraries(ext_server_shared PRIVATE ggml llama llava common ${CMAKE_THREAD_LIBS_INIT})
|
||||
install(TARGETS ext_server_shared LIBRARY)
|
||||
endif()
|
||||
|
||||
if (CUDAToolkit_FOUND)
|
||||
target_include_directories(${TARGET} PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
|
||||
if (WIN32)
|
||||
target_link_libraries(${TARGET} PRIVATE nvml)
|
||||
target_link_libraries(ext_server_shared PRIVATE nvml)
|
||||
endif()
|
||||
endif()
|
||||
@@ -1,18 +1,4 @@
|
||||
# Extern C Server
|
||||
|
||||
This directory contains a thin facade we layer on top of the Llama.cpp server to
|
||||
expose `extern C` interfaces to access the functionality through direct API
|
||||
calls in-process. The llama.cpp code uses compile time macros to configure GPU
|
||||
type along with other settings. During the `go generate ./...` execution, the
|
||||
build will generate one or more copies of the llama.cpp `extern C` server based
|
||||
on what GPU libraries are detected to support multiple GPU types as well as CPU
|
||||
only support. The Ollama go build then embeds these different servers to support
|
||||
different GPUs and settings at runtime.
|
||||
|
||||
If you are making changes to the code in this directory, make sure to disable
|
||||
caching during your go build to ensure you pick up your changes. A typical
|
||||
iteration cycle from the top of the source tree looks like:
|
||||
|
||||
```
|
||||
go generate ./... && go build -a .
|
||||
```
|
||||
This directory contains a thin facade we layer on top of the Llama.cpp server
|
||||
to expose `extern C` interfaces to access the functionality through direct API calls in-process.
|
||||
|
||||
@@ -47,13 +47,9 @@ void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
|
||||
params.model = sparams->model;
|
||||
}
|
||||
|
||||
if (sparams->lora_adapters != NULL) {
|
||||
for (ext_server_lora_adapter *la = sparams->lora_adapters; la != NULL;
|
||||
la = la->next) {
|
||||
params.lora_adapter.push_back(std::make_tuple(la->adapter, la->scale));
|
||||
}
|
||||
|
||||
params.use_mmap = false;
|
||||
for (ext_server_lora_adapter *la = sparams->lora_adapters; la != NULL;
|
||||
la = la->next) {
|
||||
params.lora_adapter.push_back(std::make_tuple(la->adapter, la->scale));
|
||||
}
|
||||
|
||||
if (sparams->mmproj != NULL) {
|
||||
@@ -115,10 +111,6 @@ void llama_server_stop() {
|
||||
// TODO - too verbose, remove once things are solid
|
||||
LOG_TEE("requesting llama server shutdown\n");
|
||||
ext_server_running = false;
|
||||
|
||||
// unblocks the update_slots() loop so it can clean up and exit
|
||||
llama->request_cancel(0);
|
||||
|
||||
ext_server_thread.join();
|
||||
delete llama;
|
||||
llama = NULL;
|
||||
|
||||
@@ -10,25 +10,31 @@ package llm
|
||||
#cgo darwin CPPFLAGS: -DGGML_USE_METAL -DGGML_METAL_NDEBUG
|
||||
#cgo darwin LDFLAGS: -lc++ -framework Accelerate
|
||||
#cgo darwin LDFLAGS: -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
|
||||
#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/build/darwin/metal/lib/libcommon.a
|
||||
#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/build/darwin/metal/lib/libext_server.a
|
||||
#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/build/darwin/metal/lib/libllama.a
|
||||
#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/build/darwin/metal/lib/libggml_static.a
|
||||
#cgo linux CFLAGS: -D_GNU_SOURCE
|
||||
#cgo linux windows CFLAGS: -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_USE_CUBLAS
|
||||
#cgo linux LDFLAGS: -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib/stubs
|
||||
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/build/linux/cpu/lib/libext_server.a
|
||||
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/build/linux/cpu/lib/libcommon.a
|
||||
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/build/linux/cpu/lib/libllama.a
|
||||
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/build/linux/cpu/lib/libggml_static.a
|
||||
#cgo linux LDFLAGS: -lrt -ldl -lstdc++ -lm
|
||||
#cgo linux windows LDFLAGS: -lpthread
|
||||
|
||||
#include <stdlib.h>
|
||||
#include "dyn_ext_server.h"
|
||||
#include "ext_server.h"
|
||||
|
||||
*/
|
||||
import "C"
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"log"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
@@ -37,9 +43,19 @@ import (
|
||||
"github.com/jmorganca/ollama/api"
|
||||
)
|
||||
|
||||
type dynExtServer struct {
|
||||
s C.struct_dynamic_llama_server
|
||||
options api.Options
|
||||
type extServer interface {
|
||||
LLM
|
||||
llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t)
|
||||
llama_server_start()
|
||||
llama_server_stop()
|
||||
llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t)
|
||||
llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t)
|
||||
llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t)
|
||||
llama_server_release_task_result(result *C.ext_server_task_result_t)
|
||||
llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t)
|
||||
llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t)
|
||||
llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t)
|
||||
llama_server_release_json_resp(json_resp **C.char)
|
||||
}
|
||||
|
||||
// Note: current implementation does not support concurrent instantiations
|
||||
@@ -64,30 +80,11 @@ func extServerResponseToErr(resp C.ext_server_resp_t) error {
|
||||
return fmt.Errorf(C.GoString(resp.msg))
|
||||
}
|
||||
|
||||
// Note: current implementation does not support concurrent instantiations
|
||||
var llm *dynExtServer
|
||||
|
||||
func newDynExtServer(library, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
|
||||
func newExtServer(server extServer, model string, adapters, projectors []string, opts api.Options) (extServer, error) {
|
||||
if !mutex.TryLock() {
|
||||
slog.Info("concurrent llm servers not yet supported, waiting for prior server to complete")
|
||||
log.Printf("concurrent llm servers not yet supported, waiting for prior server to complete")
|
||||
mutex.Lock()
|
||||
}
|
||||
updatePath(filepath.Dir(library))
|
||||
libPath := C.CString(library)
|
||||
defer C.free(unsafe.Pointer(libPath))
|
||||
resp := newExtServerResp(512)
|
||||
defer freeExtServerResp(resp)
|
||||
var srv C.struct_dynamic_llama_server
|
||||
C.dyn_init(libPath, &srv, &resp)
|
||||
if resp.id < 0 {
|
||||
mutex.Unlock()
|
||||
return nil, fmt.Errorf("Unable to load dynamic library: %s", C.GoString(resp.msg))
|
||||
}
|
||||
llm = &dynExtServer{
|
||||
s: srv,
|
||||
options: opts,
|
||||
}
|
||||
slog.Info(fmt.Sprintf("Loading Dynamic llm server: %s", library))
|
||||
|
||||
var sparams C.ext_server_params_t
|
||||
sparams.model = C.CString(model)
|
||||
@@ -136,20 +133,20 @@ func newDynExtServer(library, model string, adapters, projectors []string, opts
|
||||
|
||||
sparams.n_threads = C.uint(opts.NumThread)
|
||||
|
||||
slog.Info("Initializing llama server")
|
||||
initResp := newExtServerResp(128)
|
||||
defer freeExtServerResp(initResp)
|
||||
C.dyn_llama_server_init(llm.s, &sparams, &initResp)
|
||||
if initResp.id < 0 {
|
||||
return nil, extServerResponseToErr(initResp)
|
||||
log.Printf("Initializing internal llama server")
|
||||
resp := newExtServerResp(128)
|
||||
defer freeExtServerResp(resp)
|
||||
server.llama_server_init(&sparams, &resp)
|
||||
if resp.id < 0 {
|
||||
return nil, extServerResponseToErr(resp)
|
||||
}
|
||||
|
||||
slog.Info("Starting llama main loop")
|
||||
C.dyn_llama_server_start(llm.s)
|
||||
return llm, nil
|
||||
log.Printf("Starting internal llama main loop")
|
||||
server.llama_server_start()
|
||||
return server, nil
|
||||
}
|
||||
|
||||
func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn func(PredictResult)) error {
|
||||
func predict(ctx context.Context, llm extServer, predict PredictOpts, fn func(PredictResult)) error {
|
||||
resp := newExtServerResp(128)
|
||||
defer freeExtServerResp(resp)
|
||||
var imageData []ImageData
|
||||
@@ -158,7 +155,7 @@ func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn fu
|
||||
imageData = append(imageData, ImageData{Data: i, ID: cnt})
|
||||
}
|
||||
}
|
||||
slog.Info(fmt.Sprintf("loaded %d images", len(imageData)))
|
||||
log.Printf("loaded %d images", len(imageData))
|
||||
|
||||
request := map[string]any{
|
||||
"prompt": predict.Prompt,
|
||||
@@ -181,6 +178,7 @@ func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn fu
|
||||
"seed": predict.Options.Seed,
|
||||
"stop": predict.Options.Stop,
|
||||
"image_data": imageData,
|
||||
"cache_prompt": true,
|
||||
}
|
||||
|
||||
if predict.Format == "json" {
|
||||
@@ -206,7 +204,7 @@ func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn fu
|
||||
req := C.CString(buffer.String())
|
||||
defer C.free(unsafe.Pointer(req))
|
||||
|
||||
C.dyn_llama_server_completion(llm.s, req, &resp)
|
||||
llm.llama_server_completion(req, &resp)
|
||||
if resp.id < 0 {
|
||||
return extServerResponseToErr(resp)
|
||||
}
|
||||
@@ -217,7 +215,7 @@ func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn fu
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
// This handles the request cancellation
|
||||
C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp)
|
||||
llm.llama_server_completion_cancel(resp.id, &resp)
|
||||
if resp.id < 0 {
|
||||
return extServerResponseToErr(resp)
|
||||
} else {
|
||||
@@ -225,13 +223,13 @@ func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn fu
|
||||
}
|
||||
default:
|
||||
var result C.ext_server_task_result_t
|
||||
C.dyn_llama_server_completion_next_result(llm.s, resp.id, &result)
|
||||
llm.llama_server_completion_next_result(resp.id, &result)
|
||||
json_resp := C.GoString(result.json_resp)
|
||||
C.dyn_llama_server_release_task_result(llm.s, &result)
|
||||
llm.llama_server_release_task_result(&result)
|
||||
|
||||
var p prediction
|
||||
if err := json.Unmarshal([]byte(json_resp), &p); err != nil {
|
||||
C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp)
|
||||
llm.llama_server_completion_cancel(resp.id, &resp)
|
||||
if resp.id < 0 {
|
||||
return fmt.Errorf("error unmarshaling llm prediction response: %w and cancel %s", err, C.GoString(resp.msg))
|
||||
} else {
|
||||
@@ -272,7 +270,7 @@ func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn fu
|
||||
return fmt.Errorf("max retries exceeded")
|
||||
}
|
||||
|
||||
func (llm *dynExtServer) Encode(ctx context.Context, prompt string) ([]int, error) {
|
||||
func encode(llm extServer, ctx context.Context, prompt string) ([]int, error) {
|
||||
data, err := json.Marshal(TokenizeRequest{Content: prompt})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("marshaling encode data: %w", err)
|
||||
@@ -282,11 +280,11 @@ func (llm *dynExtServer) Encode(ctx context.Context, prompt string) ([]int, erro
|
||||
var json_resp *C.char
|
||||
resp := newExtServerResp(128)
|
||||
defer freeExtServerResp(resp)
|
||||
C.dyn_llama_server_tokenize(llm.s, req, &json_resp, &resp)
|
||||
llm.llama_server_tokenize(req, &json_resp, &resp)
|
||||
if resp.id < 0 {
|
||||
return nil, extServerResponseToErr(resp)
|
||||
}
|
||||
defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp)
|
||||
defer llm.llama_server_release_json_resp(&json_resp)
|
||||
|
||||
var encoded TokenizeResponse
|
||||
if err2 := json.Unmarshal([]byte(C.GoString(json_resp)), &encoded); err2 != nil {
|
||||
@@ -296,7 +294,7 @@ func (llm *dynExtServer) Encode(ctx context.Context, prompt string) ([]int, erro
|
||||
return encoded.Tokens, err
|
||||
}
|
||||
|
||||
func (llm *dynExtServer) Decode(ctx context.Context, tokens []int) (string, error) {
|
||||
func decode(llm extServer, ctx context.Context, tokens []int) (string, error) {
|
||||
if len(tokens) == 0 {
|
||||
return "", nil
|
||||
}
|
||||
@@ -310,11 +308,11 @@ func (llm *dynExtServer) Decode(ctx context.Context, tokens []int) (string, erro
|
||||
var json_resp *C.char
|
||||
resp := newExtServerResp(128)
|
||||
defer freeExtServerResp(resp)
|
||||
C.dyn_llama_server_detokenize(llm.s, req, &json_resp, &resp)
|
||||
llm.llama_server_detokenize(req, &json_resp, &resp)
|
||||
if resp.id < 0 {
|
||||
return "", extServerResponseToErr(resp)
|
||||
}
|
||||
defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp)
|
||||
defer llm.llama_server_release_json_resp(&json_resp)
|
||||
|
||||
var decoded DetokenizeResponse
|
||||
if err2 := json.Unmarshal([]byte(C.GoString(json_resp)), &decoded); err2 != nil {
|
||||
@@ -324,7 +322,7 @@ func (llm *dynExtServer) Decode(ctx context.Context, tokens []int) (string, erro
|
||||
return decoded.Content, err
|
||||
}
|
||||
|
||||
func (llm *dynExtServer) Embedding(ctx context.Context, input string) ([]float64, error) {
|
||||
func embedding(llm extServer, ctx context.Context, input string) ([]float64, error) {
|
||||
data, err := json.Marshal(TokenizeRequest{Content: input})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error marshaling embed data: %w", err)
|
||||
@@ -335,11 +333,11 @@ func (llm *dynExtServer) Embedding(ctx context.Context, input string) ([]float64
|
||||
var json_resp *C.char
|
||||
resp := newExtServerResp(128)
|
||||
defer freeExtServerResp(resp)
|
||||
C.dyn_llama_server_embedding(llm.s, req, &json_resp, &resp)
|
||||
llm.llama_server_embedding(req, &json_resp, &resp)
|
||||
if resp.id < 0 {
|
||||
return nil, extServerResponseToErr(resp)
|
||||
}
|
||||
defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp)
|
||||
defer llm.llama_server_release_json_resp(&json_resp)
|
||||
|
||||
var embedding EmbeddingResponse
|
||||
if err := json.Unmarshal([]byte(C.GoString(json_resp)), &embedding); err != nil {
|
||||
@@ -349,29 +347,7 @@ func (llm *dynExtServer) Embedding(ctx context.Context, input string) ([]float64
|
||||
return embedding.Embedding, nil
|
||||
}
|
||||
|
||||
func (llm *dynExtServer) Close() {
|
||||
C.dyn_llama_server_stop(llm.s)
|
||||
func close(llm extServer) {
|
||||
llm.llama_server_stop()
|
||||
mutex.Unlock()
|
||||
}
|
||||
|
||||
// updatePath puts dir at the front of the PATH environment variable on
// Windows. On every other OS it does nothing (linux and darwin rely on rpath).
//
// If dir is already present in PATH (compared case-insensitively) PATH is left
// untouched. Otherwise any existing entry that starts with the parent
// directory of dir is dropped, so prior paths into the same temp dir do not
// accumulate, and dir is prepended to the entries that remain.
func updatePath(dir string) {
	if runtime.GOOS != "windows" {
		// linux and darwin rely on rpath
		return
	}

	tmpDir := strings.ToLower(filepath.Dir(dir))
	pathComponents := strings.Split(os.Getenv("PATH"), ";")

	// Filter in place. Only the compacted prefix may be joined below;
	// joining the whole slice would re-add stale tail entries.
	kept := pathComponents[:0]
	for _, comp := range pathComponents {
		if strings.EqualFold(comp, dir) {
			return
		}
		// Remove any other prior paths to our temp dir
		if !strings.HasPrefix(strings.ToLower(comp), tmpDir) {
			kept = append(kept, comp)
		}
	}

	newPath := strings.Join(append([]string{dir}, kept...), ";")
	slog.Info(fmt.Sprintf("Updating PATH to %s", newPath))
	if err := os.Setenv("PATH", newPath); err != nil {
		slog.Warn(fmt.Sprintf("unable to update PATH: %s", err))
	}
}
|
||||
80
llm/ext_server_default.go
Normal file
80
llm/ext_server_default.go
Normal file
@@ -0,0 +1,80 @@
|
||||
//go:build !windows
|
||||
|
||||
package llm
|
||||
|
||||
/*
|
||||
#include <stdlib.h>
|
||||
#include "ext_server.h"
|
||||
|
||||
*/
|
||||
import "C"
|
||||
import (
|
||||
"context"
|
||||
|
||||
"github.com/jmorganca/ollama/api"
|
||||
)
|
||||
|
||||
// llamaExtServer is the receiver for the methods in this file that call the
// llama_server_* C functions directly through cgo. It embeds the api.Options
// it was constructed with.
type llamaExtServer struct {
|
||||
api.Options
|
||||
}
|
||||
|
||||
// llama_server_init forwards sparams and err unchanged to C.llama_server_init.
func (llm *llamaExtServer) llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t) {
|
||||
C.llama_server_init(sparams, err)
|
||||
}
|
||||
// llama_server_start calls C.llama_server_start.
func (llm *llamaExtServer) llama_server_start() {
|
||||
C.llama_server_start()
|
||||
}
|
||||
// llama_server_stop calls C.llama_server_stop.
func (llm *llamaExtServer) llama_server_stop() {
|
||||
C.llama_server_stop()
|
||||
}
|
||||
|
||||
// llama_server_completion forwards json_req and resp unchanged to
// C.llama_server_completion.
func (llm *llamaExtServer) llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t) {
|
||||
C.llama_server_completion(json_req, resp)
|
||||
}
|
||||
// llama_server_completion_next_result forwards task_id and resp unchanged to
// C.llama_server_completion_next_result.
func (llm *llamaExtServer) llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t) {
|
||||
C.llama_server_completion_next_result(task_id, resp)
|
||||
}
|
||||
// llama_server_completion_cancel forwards task_id and err unchanged to
// C.llama_server_completion_cancel.
func (llm *llamaExtServer) llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t) {
|
||||
C.llama_server_completion_cancel(task_id, err)
|
||||
}
|
||||
// llama_server_release_task_result forwards result unchanged to
// C.llama_server_release_task_result.
func (llm *llamaExtServer) llama_server_release_task_result(result *C.ext_server_task_result_t) {
|
||||
C.llama_server_release_task_result(result)
|
||||
}
|
||||
|
||||
// llama_server_tokenize forwards json_req, json_resp and err unchanged to
// C.llama_server_tokenize.
func (llm *llamaExtServer) llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
|
||||
C.llama_server_tokenize(json_req, json_resp, err)
|
||||
}
|
||||
// llama_server_detokenize forwards json_req, json_resp and err unchanged to
// C.llama_server_detokenize.
func (llm *llamaExtServer) llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
|
||||
C.llama_server_detokenize(json_req, json_resp, err)
|
||||
}
|
||||
// llama_server_embedding forwards json_req, json_resp and err unchanged to
// C.llama_server_embedding.
func (llm *llamaExtServer) llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
|
||||
C.llama_server_embedding(json_req, json_resp, err)
|
||||
}
|
||||
// llama_server_release_json_resp forwards json_resp unchanged to
// C.llama_server_release_json_resp.
func (llm *llamaExtServer) llama_server_release_json_resp(json_resp **C.char) {
|
||||
C.llama_server_release_json_resp(json_resp)
|
||||
}
|
||||
|
||||
// newDefaultExtServer wraps opts in a llamaExtServer and hands it, together
// with model, adapters, projectors and opts, to newExtServer, returning
// whatever newExtServer returns.
func newDefaultExtServer(model string, adapters, projectors []string, opts api.Options) (extServer, error) {
|
||||
server := &llamaExtServer{opts}
|
||||
return newExtServer(server, model, adapters, projectors, opts)
|
||||
}
|
||||
|
||||
// Predict delegates to the package-level predict helper, passing llm as the
// extServer to run against, and returns its error.
func (llm *llamaExtServer) Predict(ctx context.Context, pred PredictOpts, fn func(PredictResult)) error {
|
||||
return predict(ctx, llm, pred, fn)
|
||||
}
|
||||
|
||||
// Encode delegates to the package-level encode helper, passing llm as the
// extServer to run against, and returns its results.
func (llm *llamaExtServer) Encode(ctx context.Context, prompt string) ([]int, error) {
|
||||
return encode(llm, ctx, prompt)
|
||||
}
|
||||
|
||||
// Decode delegates to the package-level decode helper, passing llm as the
// extServer to run against, and returns its results.
func (llm *llamaExtServer) Decode(ctx context.Context, tokens []int) (string, error) {
|
||||
return decode(llm, ctx, tokens)
|
||||
}
|
||||
|
||||
// Embedding delegates to the package-level embedding helper, passing llm as
// the extServer to run against, and returns its results.
func (llm *llamaExtServer) Embedding(ctx context.Context, input string) ([]float64, error) {
|
||||
return embedding(llm, ctx, input)
|
||||
}
|
||||
|
||||
// Close delegates to close(llm). NOTE(review): this presumably resolves to a
// package-level close helper that shadows the builtin — confirm it is in scope
// for this build.
func (llm *llamaExtServer) Close() {
|
||||
close(llm)
|
||||
}
|
||||
12
llm/ext_server_windows.go
Normal file
12
llm/ext_server_windows.go
Normal file
@@ -0,0 +1,12 @@
|
||||
package llm
|
||||
|
||||
import (
|
||||
"github.com/jmorganca/ollama/api"
|
||||
)
|
||||
|
||||
// newDefaultExtServer is the Windows variant of the default server
// constructor: it returns the result of newDynamicShimExtServer called with
// the AvailableShims["cpu"] entry and the given model, adapters, projectors
// and opts.
func newDefaultExtServer(model string, adapters, projectors []string, opts api.Options) (extServer, error) {
|
||||
// On windows we always load the llama.cpp libraries dynamically to avoid startup DLL dependencies
|
||||
// This ensures we can update the PATH at runtime to get everything loaded
|
||||
|
||||
return newDynamicShimExtServer(AvailableShims["cpu"], model, adapters, projectors, opts)
|
||||
}
|
||||
@@ -1,44 +1,15 @@
|
||||
# common logic across linux and darwin
|
||||
|
||||
init_vars() {
|
||||
case "${GOARCH}" in
|
||||
"amd64")
|
||||
ARCH="x86_64"
|
||||
;;
|
||||
"arm64")
|
||||
ARCH="arm64"
|
||||
;;
|
||||
*)
|
||||
ARCH=$(uname -m | sed -e "s/aarch64/arm64/g")
|
||||
esac
|
||||
|
||||
LLAMACPP_DIR=../llama.cpp
|
||||
CMAKE_DEFS=""
|
||||
CMAKE_TARGETS="--target ext_server"
|
||||
CMAKE_TARGETS="--target ggml --target ggml_static --target llama --target build_info --target common --target ext_server --target llava_static"
|
||||
if echo "${CGO_CFLAGS}" | grep -- '-g' >/dev/null; then
|
||||
CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on -DLLAMA_SERVER_VERBOSE=on ${CMAKE_DEFS}"
|
||||
CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on -DLLAMA_SERVER_VERBOSE=on"
|
||||
else
|
||||
# TODO - add additional optimization flags...
|
||||
CMAKE_DEFS="-DCMAKE_BUILD_TYPE=Release -DLLAMA_SERVER_VERBOSE=off ${CMAKE_DEFS}"
|
||||
CMAKE_DEFS="-DCMAKE_BUILD_TYPE=Release -DLLAMA_SERVER_VERBOSE=off"
|
||||
fi
|
||||
case $(uname -s) in
|
||||
"Darwin")
|
||||
LIB_EXT="dylib"
|
||||
WHOLE_ARCHIVE="-Wl,-force_load"
|
||||
NO_WHOLE_ARCHIVE=""
|
||||
GCC_ARCH="-arch ${ARCH}"
|
||||
;;
|
||||
"Linux")
|
||||
LIB_EXT="so"
|
||||
WHOLE_ARCHIVE="-Wl,--whole-archive"
|
||||
NO_WHOLE_ARCHIVE="-Wl,--no-whole-archive"
|
||||
|
||||
# Cross compiling not supported on linux - Use docker
|
||||
GCC_ARCH=""
|
||||
;;
|
||||
*)
|
||||
;;
|
||||
esac
|
||||
}
|
||||
|
||||
git_module_setup() {
|
||||
@@ -69,29 +40,15 @@ apply_patches() {
|
||||
build() {
|
||||
cmake -S ${LLAMACPP_DIR} -B ${BUILD_DIR} ${CMAKE_DEFS}
|
||||
cmake --build ${BUILD_DIR} ${CMAKE_TARGETS} -j8
|
||||
mkdir -p ${BUILD_DIR}/lib/
|
||||
g++ -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.${LIB_EXT} \
|
||||
${GCC_ARCH} \
|
||||
${WHOLE_ARCHIVE} ${BUILD_DIR}/examples/server/libext_server.a ${NO_WHOLE_ARCHIVE} \
|
||||
${BUILD_DIR}/common/libcommon.a \
|
||||
${BUILD_DIR}/libllama.a \
|
||||
-Wl,-rpath,\$ORIGIN \
|
||||
-lpthread -ldl -lm \
|
||||
${EXTRA_LIBS}
|
||||
}
|
||||
|
||||
compress_libs() {
|
||||
echo "Compressing payloads to reduce overall binary size..."
|
||||
pids=""
|
||||
for lib in ${BUILD_DIR}/lib/*.${LIB_EXT}* ; do
|
||||
gzip --best ${lib} &
|
||||
pids+=" $!"
|
||||
done
|
||||
echo
|
||||
for pid in ${pids}; do
|
||||
wait $pid
|
||||
done
|
||||
echo "Finished compression"
|
||||
install() {
|
||||
rm -rf ${BUILD_DIR}/lib
|
||||
mkdir -p ${BUILD_DIR}/lib
|
||||
cp ${BUILD_DIR}/examples/server/libext_server.a ${BUILD_DIR}/lib
|
||||
cp ${BUILD_DIR}/common/libcommon.a ${BUILD_DIR}/lib
|
||||
cp ${BUILD_DIR}/libllama.a ${BUILD_DIR}/lib
|
||||
cp ${BUILD_DIR}/libggml_static.a ${BUILD_DIR}/lib
|
||||
}
|
||||
|
||||
# Keep the local tree clean after we're done with the build
|
||||
|
||||
@@ -9,52 +9,14 @@ set -o pipefail
|
||||
echo "Starting darwin generate script"
|
||||
source $(dirname $0)/gen_common.sh
|
||||
init_vars
|
||||
git_module_setup
|
||||
apply_patches
|
||||
|
||||
COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin -DLLAMA_ACCELERATE=off"
|
||||
|
||||
CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin -DLLAMA_ACCELERATE=on ${CMAKE_DEFS}"
|
||||
BUILD_DIR="${LLAMACPP_DIR}/build/darwin/metal"
|
||||
case "${GOARCH}" in
|
||||
"amd64")
|
||||
COMMON_CPU_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_NATIVE=off"
|
||||
|
||||
#
|
||||
# CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
|
||||
#
|
||||
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
|
||||
BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu"
|
||||
echo "Building LCD CPU"
|
||||
build
|
||||
compress_libs
|
||||
|
||||
#
|
||||
# ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
|
||||
# Approximately 400% faster than LCD on same CPU
|
||||
#
|
||||
init_vars
|
||||
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
|
||||
BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx"
|
||||
echo "Building AVX CPU"
|
||||
build
|
||||
compress_libs
|
||||
|
||||
#
|
||||
# ~2013 CPU Dynamic library
|
||||
# Approximately 10% faster than AVX on same CPU
|
||||
#
|
||||
init_vars
|
||||
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
|
||||
BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx2"
|
||||
echo "Building AVX2 CPU"
|
||||
build
|
||||
compress_libs
|
||||
CMAKE_DEFS="-DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DLLAMA_METAL=off -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
|
||||
;;
|
||||
"arm64")
|
||||
CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on ${CMAKE_DEFS}"
|
||||
BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/metal"
|
||||
EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders"
|
||||
build
|
||||
compress_libs
|
||||
CMAKE_DEFS="-DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DLLAMA_METAL=on ${CMAKE_DEFS}"
|
||||
;;
|
||||
*)
|
||||
echo "GOARCH must be set"
|
||||
@@ -63,4 +25,8 @@ case "${GOARCH}" in
|
||||
;;
|
||||
esac
|
||||
|
||||
git_module_setup
|
||||
apply_patches
|
||||
build
|
||||
install
|
||||
cleanup
|
||||
|
||||
@@ -2,14 +2,16 @@
|
||||
# This script is intended to run inside the go generate
|
||||
# working directory must be llm/generate/
|
||||
|
||||
# First we build one or more CPU based LLM libraries
|
||||
# First we build our default built-in library which will be linked into the CGO
|
||||
# binary as a normal dependency. This default build is CPU based.
|
||||
#
|
||||
# Then if we detect CUDA, we build a CUDA dynamic library, and carry the required
|
||||
# library dependencies
|
||||
# Then we build a CUDA dynamic library (although statically linked with the CUDA
|
||||
# library dependencies for maximum portability)
|
||||
#
|
||||
# Then if we detect ROCm, we build a dynamically loaded ROCm lib. The ROCM
|
||||
# libraries are quite large, and also dynamically load data files at runtime
|
||||
# which in turn are large, so we don't attempt to carry them as payload
|
||||
# Then if we detect ROCm, we build a dynamically loaded ROCm lib. ROCm is particularly
|
||||
# important to be a dynamic lib even if it's the only GPU library detected because
|
||||
# we can't redistribute the object files but must rely on dynamic libraries at
|
||||
# runtime, which could lead the server not to start if not present.
|
||||
|
||||
set -ex
|
||||
set -o pipefail
|
||||
@@ -37,13 +39,8 @@ amdGPUs() {
|
||||
}
|
||||
|
||||
echo "Starting linux generate script"
|
||||
if [ -z "${CUDACXX}" ]; then
|
||||
if [ -x /usr/local/cuda/bin/nvcc ]; then
|
||||
export CUDACXX=/usr/local/cuda/bin/nvcc
|
||||
else
|
||||
# Try the default location in case it exists
|
||||
export CUDACXX=$(command -v nvcc)
|
||||
fi
|
||||
if [ -z "${CUDACXX}" -a -x /usr/local/cuda/bin/nvcc ]; then
|
||||
export CUDACXX=/usr/local/cuda/bin/nvcc
|
||||
fi
|
||||
COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off"
|
||||
source $(dirname $0)/gen_common.sh
|
||||
@@ -51,102 +48,38 @@ init_vars
|
||||
git_module_setup
|
||||
apply_patches
|
||||
|
||||
if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
|
||||
# Users building from source can tune the exact flags we pass to cmake for configuring
|
||||
# llama.cpp, and we'll build only 1 CPU variant in that case as the default.
|
||||
if [ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]; then
|
||||
echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\""
|
||||
CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
|
||||
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu"
|
||||
echo "Building custom CPU"
|
||||
build
|
||||
compress_libs
|
||||
else
|
||||
# Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512
|
||||
# -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
|
||||
# -DLLAMA_F16C -- 2012 Intel Ivy Bridge & AMD 2011 Bulldozer (No significant improvement over just AVX)
|
||||
# -DLLAMA_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
|
||||
# -DLLAMA_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
|
||||
# Note: the following seem to yield slower results than AVX2 - ymmv
|
||||
# -DLLAMA_AVX512 -- 2017 Intel Skylake and High End DeskTop (HEDT)
|
||||
# -DLLAMA_AVX512_VBMI -- 2018 Intel Cannon Lake
|
||||
# -DLLAMA_AVX512_VNNI -- 2021 Intel Alder Lake
|
||||
#
|
||||
# CPU first for the default library
|
||||
#
|
||||
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
|
||||
BUILD_DIR="${LLAMACPP_DIR}/build/linux/cpu"
|
||||
|
||||
COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off"
|
||||
#
|
||||
# CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
|
||||
#
|
||||
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
|
||||
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu"
|
||||
echo "Building LCD CPU"
|
||||
build
|
||||
compress_libs
|
||||
build
|
||||
install
|
||||
|
||||
#
|
||||
# ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
|
||||
# Approximately 400% faster than LCD on same CPU
|
||||
#
|
||||
init_vars
|
||||
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
|
||||
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx"
|
||||
echo "Building AVX CPU"
|
||||
build
|
||||
compress_libs
|
||||
# Placeholder to keep go embed happy until we start building dynamic CPU lib variants
|
||||
touch ${BUILD_DIR}/lib/dummy.so
|
||||
|
||||
#
|
||||
# ~2013 CPU Dynamic library
|
||||
# Approximately 10% faster than AVX on same CPU
|
||||
#
|
||||
init_vars
|
||||
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
|
||||
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx2"
|
||||
echo "Building AVX2 CPU"
|
||||
build
|
||||
compress_libs
|
||||
fi
|
||||
else
|
||||
echo "Skipping CPU generation step as requested"
|
||||
fi
|
||||
|
||||
# If needed, look for the default CUDA toolkit location
|
||||
if [ -z "${CUDA_LIB_DIR}" ] && [ -d /usr/local/cuda/lib64 ]; then
|
||||
CUDA_LIB_DIR=/usr/local/cuda/lib64
|
||||
fi
|
||||
|
||||
# If needed, look for CUDA on Arch Linux
|
||||
if [ -z "${CUDA_LIB_DIR}" ] && [ -d /opt/cuda/targets/x86_64-linux/lib ]; then
|
||||
CUDA_LIB_DIR=/opt/cuda/targets/x86_64-linux/lib
|
||||
fi
|
||||
|
||||
if [ -d "${CUDA_LIB_DIR}" ]; then
|
||||
if [ -d /usr/local/cuda/lib64/ ]; then
|
||||
echo "CUDA libraries detected - building dynamic CUDA library"
|
||||
init_vars
|
||||
CUDA_MAJOR=$(ls "${CUDA_LIB_DIR}"/libcudart.so.* | head -1 | cut -f3 -d. || true)
|
||||
if [ -n "${CUDA_MAJOR}" ]; then
|
||||
CUDA_VARIANT=_v${CUDA_MAJOR}
|
||||
fi
|
||||
CMAKE_DEFS="-DLLAMA_CUBLAS=on ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
|
||||
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cuda${CUDA_VARIANT}"
|
||||
EXTRA_LIBS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
|
||||
BUILD_DIR="${LLAMACPP_DIR}/build/linux/cuda"
|
||||
CUDA_LIB_DIR=/usr/local/cuda/lib64
|
||||
build
|
||||
|
||||
# Carry the CUDA libs as payloads to help reduce dependency burden on users
|
||||
#
|
||||
# TODO - in the future we may shift to packaging these separately and conditionally
|
||||
# downloading them in the install script.
|
||||
DEPS="$(ldd ${BUILD_DIR}/lib/libext_server.so )"
|
||||
for lib in libcudart.so libcublas.so libcublasLt.so ; do
|
||||
DEP=$(echo "${DEPS}" | grep ${lib} | cut -f1 -d' ' | xargs || true)
|
||||
if [ -n "${DEP}" -a -e "${CUDA_LIB_DIR}/${DEP}" ]; then
|
||||
cp "${CUDA_LIB_DIR}/${DEP}" "${BUILD_DIR}/lib/"
|
||||
elif [ -e "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" ]; then
|
||||
cp "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" "${BUILD_DIR}/lib/"
|
||||
else
|
||||
cp -d "${CUDA_LIB_DIR}/${lib}*" "${BUILD_DIR}/lib/"
|
||||
fi
|
||||
done
|
||||
compress_libs
|
||||
|
||||
install
|
||||
gcc -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.so \
|
||||
-Wl,--whole-archive \
|
||||
${BUILD_DIR}/lib/libext_server.a \
|
||||
${BUILD_DIR}/lib/libcommon.a \
|
||||
${BUILD_DIR}/lib/libllama.a \
|
||||
-Wl,--no-whole-archive \
|
||||
${CUDA_LIB_DIR}/libcudart_static.a \
|
||||
${CUDA_LIB_DIR}/libcublas_static.a \
|
||||
${CUDA_LIB_DIR}/libcublasLt_static.a \
|
||||
${CUDA_LIB_DIR}/libcudadevrt.a \
|
||||
${CUDA_LIB_DIR}/libculibos.a \
|
||||
-lrt -lpthread -ldl -lstdc++ -lm
|
||||
fi
|
||||
|
||||
if [ -z "${ROCM_PATH}" ]; then
|
||||
@@ -163,18 +96,21 @@ fi
|
||||
|
||||
if [ -d "${ROCM_PATH}" ]; then
|
||||
echo "ROCm libraries detected - building dynamic ROCm library"
|
||||
if [ -f ${ROCM_PATH}/lib/librocm_smi64.so.? ]; then
|
||||
ROCM_VARIANT=_v$(ls ${ROCM_PATH}/lib/librocm_smi64.so.? | cut -f3 -d. || true)
|
||||
fi
|
||||
init_vars
|
||||
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
|
||||
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/rocm${ROCM_VARIANT}"
|
||||
EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,${ROCM_PATH}/lib,-rpath,/opt/amdgpu/lib/x86_64-linux-gnu/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
|
||||
BUILD_DIR="${LLAMACPP_DIR}/build/linux/rocm"
|
||||
build
|
||||
|
||||
# Note: the ROCM libs and runtime library files are too large to embed, so we depend on
|
||||
# them being present at runtime on the host
|
||||
compress_libs
|
||||
install
|
||||
gcc -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.so \
|
||||
-Wl,--whole-archive \
|
||||
${BUILD_DIR}/lib/libext_server.a \
|
||||
${BUILD_DIR}/lib/libcommon.a \
|
||||
${BUILD_DIR}/lib/libllama.a \
|
||||
-Wl,--no-whole-archive \
|
||||
-lrt -lpthread -ldl -lstdc++ -lm \
|
||||
-L/opt/rocm/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ \
|
||||
-Wl,-rpath,/opt/rocm/lib,-rpath,/opt/amdgpu/lib/x86_64-linux-gnu/ \
|
||||
-lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu
|
||||
fi
|
||||
|
||||
cleanup
|
||||
|
||||
@@ -4,9 +4,8 @@ $ErrorActionPreference = "Stop"
|
||||
|
||||
function init_vars {
|
||||
$script:llamacppDir = "../llama.cpp"
|
||||
$script:cmakeDefs = @("-DBUILD_SHARED_LIBS=on", "-DLLAMA_NATIVE=off", "-A","x64")
|
||||
$script:cmakeTargets = @("ext_server")
|
||||
$script:ARCH = "amd64" # arm not yet supported.
|
||||
$script:cmakeDefs = @("-DBUILD_SHARED_LIBS=on", "-DLLAMA_NATIVE=off", "-DLLAMA_F16C=off", "-DLLAMA_FMA=off", "-DLLAMA_AVX512=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX=on", "-A","x64")
|
||||
$script:cmakeTargets = @("ggml", "ggml_static", "llama", "build_info", "common", "ext_server_shared", "llava_static")
|
||||
if ($env:CGO_CFLAGS -contains "-g") {
|
||||
$script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on")
|
||||
$script:config = "RelWithDebInfo"
|
||||
@@ -14,17 +13,6 @@ function init_vars {
|
||||
$script:cmakeDefs += @("-DLLAMA_SERVER_VERBOSE=off")
|
||||
$script:config = "Release"
|
||||
}
|
||||
# Try to find the CUDA dir
|
||||
if ($env:CUDA_LIB_DIR -eq $null) {
|
||||
$d=(get-command -ea 'silentlycontinue' nvcc).path
|
||||
if ($d -ne $null) {
|
||||
$script:CUDA_LIB_DIR=($d| split-path -parent)
|
||||
}
|
||||
} else {
|
||||
$script:CUDA_LIB_DIR=$env:CUDA_LIB_DIR
|
||||
}
|
||||
$script:GZIP=(get-command -ea 'silentlycontinue' gzip).path
|
||||
$script:DUMPBIN=(get-command -ea 'silentlycontinue' dumpbin).path
|
||||
}
|
||||
|
||||
function git_module_setup {
|
||||
@@ -59,25 +47,11 @@ function build {
|
||||
function install {
|
||||
rm -ea 0 -recurse -force -path "${script:buildDir}/lib"
|
||||
md "${script:buildDir}/lib" -ea 0 > $null
|
||||
cp "${script:buildDir}/bin/${script:config}/ext_server.dll" "${script:buildDir}/lib"
|
||||
cp "${script:buildDir}/bin/${script:config}/ext_server_shared.dll" "${script:buildDir}/lib"
|
||||
cp "${script:buildDir}/bin/${script:config}/llama.dll" "${script:buildDir}/lib"
|
||||
|
||||
# Display the dll dependencies in the build log
|
||||
if ($script:DUMPBIN -ne $null) {
|
||||
& "$script:DUMPBIN" /dependents "${script:buildDir}/bin/${script:config}/ext_server.dll" | select-string ".dll"
|
||||
}
|
||||
}
|
||||
|
||||
function compress_libs {
|
||||
if ($script:GZIP -eq $null) {
|
||||
write-host "gzip not installed, not compressing files"
|
||||
return
|
||||
}
|
||||
write-host "Compressing dlls..."
|
||||
$libs = dir "${script:buildDir}/lib/*.dll"
|
||||
foreach ($file in $libs) {
|
||||
& "$script:GZIP" --best $file
|
||||
}
|
||||
dumpbin /dependents "${script:buildDir}/bin/${script:config}/ext_server_shared.dll" | select-string ".dll"
|
||||
}
|
||||
|
||||
function cleanup {
|
||||
@@ -89,55 +63,21 @@ init_vars
|
||||
git_module_setup
|
||||
apply_patches
|
||||
|
||||
# -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
|
||||
# -DLLAMA_F16C -- 2012 Intel Ivy Bridge & AMD 2011 Bulldozer (No significant improvement over just AVX)
|
||||
# -DLLAMA_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
|
||||
# -DLLAMA_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
|
||||
# first build CPU based
|
||||
$script:buildDir="${script:llamacppDir}/build/windows/cpu"
|
||||
|
||||
$script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on", "-DLLAMA_NATIVE=off")
|
||||
|
||||
$script:cmakeDefs = $script:commonCpuDefs + @("-DLLAMA_AVX=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
|
||||
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu"
|
||||
write-host "Building LCD CPU"
|
||||
build
|
||||
install
|
||||
compress_libs
|
||||
|
||||
$script:cmakeDefs = $script:commonCpuDefs + @("-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
|
||||
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu_avx"
|
||||
write-host "Building AVX CPU"
|
||||
# Then build cuda as a dynamically loaded library
|
||||
init_vars
|
||||
$script:buildDir="${script:llamacppDir}/build/windows/cuda"
|
||||
$script:cmakeDefs += @("-DLLAMA_CUBLAS=ON")
|
||||
build
|
||||
install
|
||||
compress_libs
|
||||
|
||||
$script:cmakeDefs = $script:commonCpuDefs + @("-DLLAMA_AVX=on", "-DLLAMA_AVX2=on", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=on", "-DLLAMA_F16C=on") + $script:cmakeDefs
|
||||
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu_avx2"
|
||||
write-host "Building AVX2 CPU"
|
||||
build
|
||||
install
|
||||
compress_libs
|
||||
|
||||
if ($null -ne $script:CUDA_LIB_DIR) {
|
||||
# Then build cuda as a dynamically loaded library
|
||||
$nvcc = (get-command -ea 'silentlycontinue' nvcc)
|
||||
if ($null -ne $nvcc) {
|
||||
$script:CUDA_VERSION=(get-item ($nvcc | split-path | split-path)).Basename
|
||||
}
|
||||
if ($null -ne $script:CUDA_VERSION) {
|
||||
$script:CUDA_VARIANT="_"+$script:CUDA_VERSION
|
||||
}
|
||||
init_vars
|
||||
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT"
|
||||
$script:cmakeDefs += @("-DLLAMA_CUBLAS=ON", "-DLLAMA_AVX=on")
|
||||
build
|
||||
install
|
||||
cp "${script:CUDA_LIB_DIR}/cudart64_*.dll" "${script:buildDir}/lib"
|
||||
cp "${script:CUDA_LIB_DIR}/cublas64_*.dll" "${script:buildDir}/lib"
|
||||
cp "${script:CUDA_LIB_DIR}/cublasLt64_*.dll" "${script:buildDir}/lib"
|
||||
compress_libs
|
||||
}
|
||||
# TODO - actually implement ROCm support on windows
|
||||
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/rocm"
|
||||
$script:buildDir="${script:llamacppDir}/build/windows/rocm"
|
||||
|
||||
rm -ea 0 -recurse -force -path "${script:buildDir}/lib"
|
||||
md "${script:buildDir}/lib" -ea 0 > $null
|
||||
|
||||
@@ -83,7 +83,6 @@ type model interface {
|
||||
NumEmbed() uint32
|
||||
NumHead() uint32
|
||||
NumHeadKv() uint32
|
||||
NumCtx() uint32
|
||||
}
|
||||
|
||||
type container interface {
|
||||
@@ -99,9 +98,9 @@ func (c *containerLORA) Name() string {
|
||||
return "ggla"
|
||||
}
|
||||
|
||||
func (c *containerLORA) Decode(rso *readSeekOffset) (model, error) {
|
||||
func (c *containerLORA) Decode(ro *readSeekOffset) (model, error) {
|
||||
var version uint32
|
||||
binary.Read(rso, binary.LittleEndian, &version)
|
||||
binary.Read(ro, binary.LittleEndian, &version)
|
||||
|
||||
switch version {
|
||||
case 1:
|
||||
@@ -112,7 +111,7 @@ func (c *containerLORA) Decode(rso *readSeekOffset) (model, error) {
|
||||
c.version = version
|
||||
|
||||
// remaining file contents aren't decoded
|
||||
rso.Seek(0, io.SeekEnd)
|
||||
ro.Seek(0, io.SeekEnd)
|
||||
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
@@ -308,15 +308,6 @@ func (llm *ggufModel) NumHeadKv() uint32 {
|
||||
return value.(uint32)
|
||||
}
|
||||
|
||||
func (llm *ggufModel) NumCtx() uint32 {
|
||||
value, exists := llm.kv[fmt.Sprintf("%s.context_length", llm.ModelFamily())]
|
||||
if !exists {
|
||||
return 0
|
||||
}
|
||||
|
||||
return value.(uint32)
|
||||
}
|
||||
|
||||
func (llm *ggufModel) NumGQA() uint32 {
|
||||
numHeadKv := llm.NumHeadKv()
|
||||
if numHeadKv == 0 {
|
||||
|
||||
Submodule llm/llama.cpp updated: 584d674be6...328b83de23
49
llm/llama.go
49
llm/llama.go
@@ -1,11 +1,17 @@
|
||||
package llm
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
_ "embed"
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"time"
|
||||
|
||||
"github.com/jmorganca/ollama/api"
|
||||
"github.com/jmorganca/ollama/format"
|
||||
)
|
||||
|
||||
const jsonGrammar = `
|
||||
@@ -36,12 +42,51 @@ number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
|
||||
ws ::= ([ \t\n] ws)?
|
||||
`
|
||||
|
||||
// Running groups the handles kept for a llama runner process: a port number,
// the exec.Cmd, a context cancel function and the embedded StatusWriter.
// NOTE(review): Port is presumably the port the runner listens on - confirm
// against the code that populates this struct.
type Running struct {
|
||||
Port int
|
||||
Cmd *exec.Cmd
|
||||
Cancel context.CancelFunc
|
||||
*StatusWriter // captures error messages from the llama runner process
|
||||
}
|
||||
|
||||
// ImageData pairs a raw byte payload with an integer ID; the fields are
// encoded to JSON as "data" and "id".
// NOTE(review): Data presumably holds the encoded image bytes - confirm.
type ImageData struct {
|
||||
Data []byte `json:"data"`
|
||||
ID int `json:"id"`
|
||||
}
|
||||
|
||||
var payloadMissing = fmt.Errorf("expected dynamic library payloads not included in this build of ollama")
|
||||
var (
|
||||
errNvidiaSMI = errors.New("warning: gpu support may not be enabled, check that you have installed GPU drivers: nvidia-smi command failed")
|
||||
errAvailableVRAM = errors.New("not enough VRAM available, falling back to CPU only")
|
||||
payloadMissing = fmt.Errorf("expected dynamic library payloads not included in this build of ollama")
|
||||
)
|
||||
|
||||
// StatusWriter is a writer that captures error messages from the llama runner
// process while passing everything it receives through to os.Stderr.
type StatusWriter struct {
	// ErrCh receives an error for a detected message whenever the channel
	// has free capacity; it is never written to with a blocking send.
	ErrCh chan error
	// LastErrMsg is the most recent error message detected by Write.
	LastErrMsg string
}

// NewStatusWriter returns a StatusWriter whose ErrCh is buffered with a
// capacity of one.
func NewStatusWriter() *StatusWriter {
	return &StatusWriter{
		ErrCh: make(chan error, 1),
	}
}

// Write scans b for an error marker, records any message found, and then
// forwards b unchanged to os.Stderr, returning the result of that write.
//
// Everything after the first "error:" marker (or, failing that, the first
// "CUDA error" marker) is trimmed of surrounding whitespace and treated as
// the message. A non-empty message is stored in LastErrMsg and offered to
// ErrCh wrapped as "llama runner: <message>".
func (w *StatusWriter) Write(b []byte) (int, error) {
	var errMsg string
	if _, after, ok := bytes.Cut(b, []byte("error:")); ok {
		errMsg = string(bytes.TrimSpace(after))
	} else if _, after, ok := bytes.Cut(b, []byte("CUDA error")); ok {
		errMsg = string(bytes.TrimSpace(after))
	}

	if errMsg != "" {
		w.LastErrMsg = errMsg

		// Never block the writer: if nobody has drained ErrCh yet (or the
		// channel is nil on a zero-value StatusWriter) the error is dropped
		// here. A blocking send would stall whatever is copying the runner's
		// output; LastErrMsg still holds the latest message.
		select {
		case w.ErrCh <- fmt.Errorf("llama runner: %s", errMsg):
		default:
		}
	}

	return os.Stderr.Write(b)
}
|
||||
|
||||
type prediction struct {
|
||||
Content string `json:"content"`
|
||||
@@ -57,7 +102,9 @@ type prediction struct {
|
||||
}
|
||||
}
|
||||
|
||||
const maxBufferSize = 512 * format.KiloByte
|
||||
const maxRetries = 3
|
||||
const retryDelay = 1 * time.Second
|
||||
|
||||
type PredictOpts struct {
|
||||
Prompt string
|
||||
|
||||
177
llm/llm.go
177
llm/llm.go
@@ -3,7 +3,7 @@ package llm
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"log"
|
||||
"os"
|
||||
"runtime"
|
||||
|
||||
@@ -19,6 +19,8 @@ type LLM interface {
|
||||
Close()
|
||||
}
|
||||
|
||||
var AvailableShims = map[string]string{}
|
||||
|
||||
func New(workDir, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
|
||||
if _, err := os.Stat(model); err != nil {
|
||||
return nil, err
|
||||
@@ -35,91 +37,95 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if opts.NumCtx > int(ggml.NumCtx()) {
|
||||
slog.Warn(fmt.Sprintf("requested context length is greater than model's max context length (%d > %d), using %d instead", opts.NumCtx, ggml.NumCtx(), ggml.NumCtx()))
|
||||
opts.NumCtx = int(ggml.NumCtx())
|
||||
}
|
||||
|
||||
if opts.NumCtx < 4 {
|
||||
opts.NumCtx = 4
|
||||
}
|
||||
|
||||
vram, _ := gpu.CheckVRAM()
|
||||
size := ggml.Size
|
||||
fmt.Println("size", ggml.Size)
|
||||
fmt.Println("filetype", ggml.FileType())
|
||||
fmt.Println("architecture", ggml.ModelFamily())
|
||||
fmt.Println("type", ggml.ModelType())
|
||||
fmt.Println("name", ggml.Name())
|
||||
fmt.Println("embd", ggml.NumEmbed())
|
||||
fmt.Println("head", ggml.NumHead())
|
||||
fmt.Println("head_kv", ggml.NumHeadKv())
|
||||
fmt.Println("gqa", ggml.NumGQA())
|
||||
|
||||
available, _ := gpu.CheckVRAM()
|
||||
|
||||
// For now assume filesize = model size
|
||||
// TODO: use actual model size
|
||||
requiredModel := ggml.Size
|
||||
|
||||
// fp16 k,v matrices require = n_ctx * n_layer * n_embd / n_head * n_head_kv * 2 bytes each * 2 key and value
|
||||
kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(ggml.NumHead())
|
||||
requiredKv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(ggml.NumHead())
|
||||
|
||||
// rough estimation for scratch space based on context size, batch size and number of layers in the model
|
||||
// TODO: instead call llama.cpp's alloc functions to measure required memory
|
||||
// TODO: account for quantization levels
|
||||
scratch := 8*int64(opts.NumCtx)*int64(opts.NumBatch)*int64(ggml.NumLayers()) + 1536*1024*1024 // 1536MiB overhead
|
||||
// this amount is the overhead + tensors in memory
|
||||
// TODO: get this from llama.cpp's graph calculations instead of
|
||||
// estimating it's 1/6 * kv_cache_size * num_gqa
|
||||
requiredAlloc := int64(ggml.NumGQA()) * requiredKv / 6
|
||||
|
||||
requiredTotal := requiredModel + requiredKv + requiredAlloc
|
||||
|
||||
log.Println("system memory bytes:", available)
|
||||
log.Println("required model bytes:", requiredModel)
|
||||
log.Println("required kv bytes:", requiredKv)
|
||||
log.Println("required alloc bytes:", requiredAlloc)
|
||||
log.Println("required total bytes:", requiredTotal)
|
||||
|
||||
info := gpu.GetGPUInfo()
|
||||
switch runtime.GOOS {
|
||||
case "darwin":
|
||||
if opts.NumGPU == 0 {
|
||||
break
|
||||
}
|
||||
library := info.Library
|
||||
|
||||
if size+kv+scratch > vram {
|
||||
slog.Info("not enough vram available, falling back to CPU only")
|
||||
info.Library = "cpu"
|
||||
info.Variant = gpu.GetCPUVariant()
|
||||
opts.NumGPU = 0
|
||||
break
|
||||
}
|
||||
|
||||
opts.NumGPU = 1
|
||||
default:
|
||||
if info.Library == "cpu" {
|
||||
slog.Info("GPU not available, falling back to CPU")
|
||||
opts.NumGPU = 0
|
||||
break
|
||||
}
|
||||
|
||||
// don't use GPU at all if no layers are loaded
|
||||
if opts.NumGPU == 0 {
|
||||
info.Library = "cpu"
|
||||
info.Variant = gpu.GetCPUVariant()
|
||||
break
|
||||
}
|
||||
|
||||
// user-defined GPU count
|
||||
if opts.NumGPU != -1 {
|
||||
break
|
||||
}
|
||||
|
||||
// the "main" GPU needs the most memory and determines the limit
|
||||
// of how many layers can be loaded. It needs to fit:
|
||||
// 1. the full compute graph allocation for all devices (graph)
|
||||
// 2. the proportional kv cache for all devices (kv * % layers)
|
||||
// 3. the proportional model (size * % layers / # devices)
|
||||
// This estimates the number of layers
|
||||
maxlayers := int64(ggml.NumLayers()) + 1
|
||||
devices := int64(info.DeviceCount)
|
||||
avg := vram / devices
|
||||
layers := maxlayers * (avg - scratch) / (kv + size/devices)
|
||||
if layers > maxlayers {
|
||||
layers = maxlayers
|
||||
}
|
||||
|
||||
// 1 + 2 must fit on the main gpu
|
||||
min := scratch + kv*layers/maxlayers
|
||||
if layers <= 0 || min > avg {
|
||||
slog.Info("not enough vram available, falling back to CPU only")
|
||||
info.Library = "cpu"
|
||||
info.Variant = gpu.GetCPUVariant()
|
||||
opts.NumGPU = 0
|
||||
break
|
||||
}
|
||||
|
||||
opts.NumGPU = int(layers)
|
||||
if opts.NumGPU == -1 {
|
||||
// default to offloading all layers
|
||||
opts.NumGPU = int(ggml.NumLayers()) + 1
|
||||
}
|
||||
|
||||
// decide how many layers to put on the GPU
|
||||
if opts.NumGPU > 0 {
|
||||
switch runtime.GOOS {
|
||||
case "darwin":
|
||||
if requiredTotal > available {
|
||||
log.Println("not enough vram available, falling back to CPU only")
|
||||
opts.NumGPU = 0
|
||||
}
|
||||
default:
|
||||
if library == "cpu" || library == "default" {
|
||||
opts.NumGPU = 0
|
||||
break
|
||||
}
|
||||
|
||||
// no offloading required
|
||||
if requiredTotal <= available {
|
||||
break
|
||||
}
|
||||
|
||||
// requiredAlloc is always loaded for the CUDA runner, so don't load it if it won't fit
|
||||
if requiredAlloc > available {
|
||||
log.Printf("not enough vram available, falling back to CPU only")
|
||||
library = "cpu"
|
||||
opts.NumGPU = 0
|
||||
break
|
||||
}
|
||||
|
||||
available -= requiredAlloc
|
||||
|
||||
// fill remaining vram with layers
|
||||
log.Println("splitting", available, "of available memory bytes into layers")
|
||||
bytesPerLayer := int64((requiredModel + requiredKv) / int64(ggml.NumLayers()))
|
||||
log.Println("bytes per layer:", bytesPerLayer)
|
||||
layers := available / bytesPerLayer
|
||||
log.Println("total required with split:", requiredAlloc+(layers*bytesPerLayer))
|
||||
if layers < int64(opts.NumGPU) {
|
||||
opts.NumGPU = int(layers)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
opts.NumGQA = 0
|
||||
opts.RopeFrequencyBase = 0.0
|
||||
opts.RopeFrequencyScale = 0.0
|
||||
return newLlmServer(info, model, adapters, projectors, opts)
|
||||
return newLlmServer(library, model, adapters, projectors, opts)
|
||||
}
|
||||
|
||||
// Give any native cgo implementations an opportunity to initialize
|
||||
@@ -127,30 +133,15 @@ func Init(workdir string) error {
|
||||
return nativeInit(workdir)
|
||||
}
|
||||
|
||||
func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
|
||||
dynLibs := getDynLibs(gpuInfo)
|
||||
|
||||
// Check to see if the user has requested a specific library instead of auto-detecting
|
||||
demandLib := os.Getenv("OLLAMA_LLM_LIBRARY")
|
||||
if demandLib != "" {
|
||||
libPath := availableDynLibs[demandLib]
|
||||
if libPath == "" {
|
||||
slog.Info(fmt.Sprintf("Invalid OLLAMA_LLM_LIBRARY %s - not found", demandLib))
|
||||
} else {
|
||||
slog.Info(fmt.Sprintf("Loading OLLAMA_LLM_LIBRARY=%s", demandLib))
|
||||
dynLibs = []string{libPath}
|
||||
}
|
||||
}
|
||||
|
||||
err2 := fmt.Errorf("unable to locate suitable llm library")
|
||||
for _, dynLib := range dynLibs {
|
||||
srv, err := newDynExtServer(dynLib, model, adapters, projectors, opts)
|
||||
func newLlmServer(library, model string, adapters, projectors []string, opts api.Options) (extServer, error) {
|
||||
if _, libPresent := AvailableShims[library]; libPresent && library != "default" {
|
||||
srv, err := newDynamicShimExtServer(AvailableShims[library], model, adapters, projectors, opts)
|
||||
if err == nil {
|
||||
return srv, nil
|
||||
}
|
||||
slog.Warn(fmt.Sprintf("Failed to load dynamic library %s %s", dynLib, err))
|
||||
err2 = err
|
||||
log.Printf("Failed to load dynamic library %s - falling back to CPU mode %s", library, err)
|
||||
// TODO - update some state to indicate we were unable to load the GPU library for future "info" ux
|
||||
}
|
||||
|
||||
return nil, err2
|
||||
return newDefaultExtServer(model, adapters, projectors, opts)
|
||||
}
|
||||
|
||||
@@ -1,283 +0,0 @@
|
||||
package llm
|
||||
|
||||
import (
|
||||
"compress/gzip"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"io/fs"
|
||||
"log/slog"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/exp/slices"
|
||||
"golang.org/x/sync/errgroup"
|
||||
|
||||
"github.com/jmorganca/ollama/gpu"
|
||||
)
|
||||
|
||||
// Libraries names may contain an optional variant separated by '_'
|
||||
// For example, "rocm_v6" and "rocm_v5" or "cpu" and "cpu_avx2"
|
||||
// Any library without a variant is the lowest common denominator
|
||||
var availableDynLibs = map[string]string{}
|
||||
|
||||
const pathComponentCount = 7
|
||||
|
||||
// getDynLibs returns an ordered list of LLM libraries to try, starting with the best
|
||||
func getDynLibs(gpuInfo gpu.GpuInfo) []string {
|
||||
// Short circuit if we know we're using the default built-in (darwin only)
|
||||
if gpuInfo.Library == "default" {
|
||||
return []string{"default"}
|
||||
}
|
||||
// TODO - temporary until we have multiple CPU variations for Darwin
|
||||
// Short circuit on darwin with metal only
|
||||
if len(availableDynLibs) == 1 {
|
||||
if _, onlyMetal := availableDynLibs["metal"]; onlyMetal {
|
||||
return []string{availableDynLibs["metal"]}
|
||||
}
|
||||
}
|
||||
|
||||
exactMatch := ""
|
||||
dynLibs := []string{}
|
||||
altDynLibs := []string{}
|
||||
requested := gpuInfo.Library
|
||||
if gpuInfo.Variant != "" {
|
||||
requested += "_" + gpuInfo.Variant
|
||||
}
|
||||
// Try to find an exact match
|
||||
for cmp := range availableDynLibs {
|
||||
if requested == cmp {
|
||||
exactMatch = cmp
|
||||
dynLibs = []string{availableDynLibs[cmp]}
|
||||
break
|
||||
}
|
||||
}
|
||||
// Then for GPUs load alternates and sort the list for consistent load ordering
|
||||
if gpuInfo.Library != "cpu" {
|
||||
for cmp := range availableDynLibs {
|
||||
if gpuInfo.Library == strings.Split(cmp, "_")[0] && cmp != exactMatch {
|
||||
altDynLibs = append(altDynLibs, cmp)
|
||||
}
|
||||
}
|
||||
slices.Sort(altDynLibs)
|
||||
for _, altDynLib := range altDynLibs {
|
||||
dynLibs = append(dynLibs, availableDynLibs[altDynLib])
|
||||
}
|
||||
}
|
||||
|
||||
// Load up the best CPU variant if not primary requested
|
||||
if gpuInfo.Library != "cpu" {
|
||||
variant := gpu.GetCPUVariant()
|
||||
// If no variant, then we fall back to default
|
||||
// If we have a variant, try that if we find an exact match
|
||||
// Attempting to run the wrong CPU instructions will panic the
|
||||
// process
|
||||
if variant != "" {
|
||||
for cmp := range availableDynLibs {
|
||||
if cmp == "cpu_"+variant {
|
||||
dynLibs = append(dynLibs, availableDynLibs[cmp])
|
||||
break
|
||||
}
|
||||
}
|
||||
} else {
|
||||
dynLibs = append(dynLibs, availableDynLibs["cpu"])
|
||||
}
|
||||
}
|
||||
|
||||
// Finally, if we didn't find any matches, LCD CPU FTW
|
||||
if len(dynLibs) == 0 {
|
||||
dynLibs = []string{availableDynLibs["cpu"]}
|
||||
}
|
||||
return dynLibs
|
||||
}
|
||||
|
||||
func rocmDynLibPresent() bool {
|
||||
for dynLibName := range availableDynLibs {
|
||||
if strings.HasPrefix(dynLibName, "rocm") {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func nativeInit(workdir string) error {
|
||||
slog.Info("Extracting dynamic libraries...")
|
||||
if runtime.GOOS == "darwin" {
|
||||
err := extractPayloadFiles(workdir, "llama.cpp/ggml-metal.metal")
|
||||
if err != nil {
|
||||
if err == payloadMissing {
|
||||
// TODO perhaps consider this a hard failure on arm macs?
|
||||
slog.Info("ggml-meta.metal payload missing")
|
||||
return nil
|
||||
}
|
||||
return err
|
||||
}
|
||||
os.Setenv("GGML_METAL_PATH_RESOURCES", workdir)
|
||||
}
|
||||
|
||||
libs, err := extractDynamicLibs(workdir, "llama.cpp/build/*/*/*/lib/*")
|
||||
if err != nil {
|
||||
if err == payloadMissing {
|
||||
slog.Info(fmt.Sprintf("%s", payloadMissing))
|
||||
return nil
|
||||
}
|
||||
return err
|
||||
}
|
||||
for _, lib := range libs {
|
||||
// The last dir component is the variant name
|
||||
variant := filepath.Base(filepath.Dir(lib))
|
||||
availableDynLibs[variant] = lib
|
||||
}
|
||||
|
||||
if err := verifyDriverAccess(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Report which dynamic libraries we have loaded to assist troubleshooting
|
||||
variants := make([]string, len(availableDynLibs))
|
||||
i := 0
|
||||
for variant := range availableDynLibs {
|
||||
variants[i] = variant
|
||||
i++
|
||||
}
|
||||
slog.Info(fmt.Sprintf("Dynamic LLM libraries %v", variants))
|
||||
slog.Debug("Override detection logic by setting OLLAMA_LLM_LIBRARY")
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func extractDynamicLibs(workDir, glob string) ([]string, error) {
|
||||
files, err := fs.Glob(libEmbed, glob)
|
||||
if err != nil || len(files) == 0 {
|
||||
return nil, payloadMissing
|
||||
}
|
||||
libs := []string{}
|
||||
|
||||
// TODO consider making this idempotent with some sort of persistent directory (where we store models probably)
|
||||
// and tracking by version so we don't reexpand the files every time
|
||||
// Also maybe consider lazy loading only what is needed
|
||||
|
||||
g := new(errgroup.Group)
|
||||
for _, file := range files {
|
||||
pathComps := strings.Split(file, "/")
|
||||
if len(pathComps) != pathComponentCount {
|
||||
slog.Error(fmt.Sprintf("unexpected payload components: %v", pathComps))
|
||||
continue
|
||||
}
|
||||
|
||||
file := file
|
||||
g.Go(func() error {
|
||||
// llama.cpp/build/$OS/$GOARCH/$VARIANT/lib/$LIBRARY
|
||||
// Include the variant in the path to avoid conflicts between multiple server libs
|
||||
targetDir := filepath.Join(workDir, pathComps[pathComponentCount-3])
|
||||
srcFile, err := libEmbed.Open(file)
|
||||
if err != nil {
|
||||
return fmt.Errorf("read payload %s: %v", file, err)
|
||||
}
|
||||
defer srcFile.Close()
|
||||
if err := os.MkdirAll(targetDir, 0o755); err != nil {
|
||||
return fmt.Errorf("create payload temp dir %s: %v", workDir, err)
|
||||
}
|
||||
src := io.Reader(srcFile)
|
||||
filename := file
|
||||
if strings.HasSuffix(file, ".gz") {
|
||||
src, err = gzip.NewReader(src)
|
||||
if err != nil {
|
||||
return fmt.Errorf("decompress payload %s: %v", file, err)
|
||||
}
|
||||
filename = strings.TrimSuffix(filename, ".gz")
|
||||
}
|
||||
|
||||
destFile := filepath.Join(targetDir, filepath.Base(filename))
|
||||
if strings.Contains(destFile, "server") {
|
||||
libs = append(libs, destFile)
|
||||
}
|
||||
|
||||
_, err = os.Stat(destFile)
|
||||
switch {
|
||||
case errors.Is(err, os.ErrNotExist):
|
||||
destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
|
||||
if err != nil {
|
||||
return fmt.Errorf("write payload %s: %v", file, err)
|
||||
}
|
||||
defer destFile.Close()
|
||||
if _, err := io.Copy(destFile, src); err != nil {
|
||||
return fmt.Errorf("copy payload %s: %v", file, err)
|
||||
}
|
||||
case err != nil:
|
||||
return fmt.Errorf("stat payload %s: %v", file, err)
|
||||
}
|
||||
return nil
|
||||
})
|
||||
}
|
||||
return libs, g.Wait()
|
||||
}
|
||||
|
||||
func extractPayloadFiles(workDir, glob string) error {
|
||||
files, err := fs.Glob(libEmbed, glob)
|
||||
if err != nil || len(files) == 0 {
|
||||
return payloadMissing
|
||||
}
|
||||
|
||||
for _, file := range files {
|
||||
srcFile, err := libEmbed.Open(file)
|
||||
if err != nil {
|
||||
return fmt.Errorf("read payload %s: %v", file, err)
|
||||
}
|
||||
defer srcFile.Close()
|
||||
if err := os.MkdirAll(workDir, 0o755); err != nil {
|
||||
return fmt.Errorf("create payload temp dir %s: %v", workDir, err)
|
||||
}
|
||||
src := io.Reader(srcFile)
|
||||
filename := file
|
||||
if strings.HasSuffix(file, ".gz") {
|
||||
src, err = gzip.NewReader(src)
|
||||
if err != nil {
|
||||
return fmt.Errorf("decompress payload %s: %v", file, err)
|
||||
}
|
||||
filename = strings.TrimSuffix(filename, ".gz")
|
||||
}
|
||||
|
||||
destFile := filepath.Join(workDir, filepath.Base(filename))
|
||||
_, err = os.Stat(destFile)
|
||||
switch {
|
||||
case errors.Is(err, os.ErrNotExist):
|
||||
destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
|
||||
if err != nil {
|
||||
return fmt.Errorf("write payload %s: %v", file, err)
|
||||
}
|
||||
defer destFile.Close()
|
||||
if _, err := io.Copy(destFile, src); err != nil {
|
||||
return fmt.Errorf("copy payload %s: %v", file, err)
|
||||
}
|
||||
case err != nil:
|
||||
return fmt.Errorf("stat payload %s: %v", file, err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func verifyDriverAccess() error {
|
||||
if runtime.GOOS != "linux" {
|
||||
return nil
|
||||
}
|
||||
// Only check ROCm access if we have the dynamic lib loaded
|
||||
if rocmDynLibPresent() {
|
||||
// Verify we have permissions - either running as root, or we have group access to the driver
|
||||
fd, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0666)
|
||||
if err != nil {
|
||||
if errors.Is(err, fs.ErrPermission) {
|
||||
return fmt.Errorf("Radeon card detected, but permissions not set up properly. Either run ollama as root, or add you user account to the render group.")
|
||||
} else if errors.Is(err, fs.ErrNotExist) {
|
||||
// expected behavior without a radeon card
|
||||
return nil
|
||||
}
|
||||
|
||||
return fmt.Errorf("failed to check permission on /dev/kfd: %w", err)
|
||||
}
|
||||
fd.Close()
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -1,8 +0,0 @@
|
||||
package llm
|
||||
|
||||
import (
|
||||
"embed"
|
||||
)
|
||||
|
||||
//go:embed llama.cpp/ggml-metal.metal llama.cpp/build/darwin/x86_64/*/lib/*.dylib*
|
||||
var libEmbed embed.FS
|
||||
@@ -1,8 +0,0 @@
|
||||
package llm
|
||||
|
||||
import (
|
||||
"embed"
|
||||
)
|
||||
|
||||
//go:embed llama.cpp/ggml-metal.metal llama.cpp/build/darwin/arm64/*/lib/*.dylib*
|
||||
var libEmbed embed.FS
|
||||
@@ -1,8 +0,0 @@
|
||||
package llm
|
||||
|
||||
import (
|
||||
"embed"
|
||||
)
|
||||
|
||||
//go:embed llama.cpp/build/linux/*/*/lib/*.so*
|
||||
var libEmbed embed.FS
|
||||
@@ -1,58 +0,0 @@
|
||||
package llm
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/jmorganca/ollama/gpu"
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func TestGetDynLibs(t *testing.T) {
|
||||
availableDynLibs = map[string]string{
|
||||
"cpu": "X_cpu",
|
||||
}
|
||||
assert.Equal(t, false, rocmDynLibPresent())
|
||||
res := getDynLibs(gpu.GpuInfo{Library: "cpu"})
|
||||
assert.Len(t, res, 1)
|
||||
assert.Equal(t, availableDynLibs["cpu"], res[0])
|
||||
|
||||
variant := gpu.GetCPUVariant()
|
||||
if variant != "" {
|
||||
variant = "_" + variant
|
||||
}
|
||||
availableDynLibs = map[string]string{
|
||||
"rocm_v5": "X_rocm_v5",
|
||||
"rocm_v6": "X_rocm_v6",
|
||||
"cpu" + variant: "X_cpu",
|
||||
}
|
||||
assert.Equal(t, true, rocmDynLibPresent())
|
||||
res = getDynLibs(gpu.GpuInfo{Library: "rocm"})
|
||||
assert.Len(t, res, 3)
|
||||
assert.Equal(t, availableDynLibs["rocm_v5"], res[0])
|
||||
assert.Equal(t, availableDynLibs["rocm_v6"], res[1])
|
||||
assert.Equal(t, availableDynLibs["cpu"+variant], res[2])
|
||||
|
||||
res = getDynLibs(gpu.GpuInfo{Library: "rocm", Variant: "v6"})
|
||||
assert.Len(t, res, 3)
|
||||
assert.Equal(t, availableDynLibs["rocm_v6"], res[0])
|
||||
assert.Equal(t, availableDynLibs["rocm_v5"], res[1])
|
||||
assert.Equal(t, availableDynLibs["cpu"+variant], res[2])
|
||||
|
||||
res = getDynLibs(gpu.GpuInfo{Library: "cuda"})
|
||||
assert.Len(t, res, 1)
|
||||
assert.Equal(t, availableDynLibs["cpu"+variant], res[0])
|
||||
|
||||
res = getDynLibs(gpu.GpuInfo{Library: "default"})
|
||||
assert.Len(t, res, 1)
|
||||
assert.Equal(t, "default", res[0])
|
||||
|
||||
availableDynLibs = map[string]string{
|
||||
"rocm": "X_rocm_v5",
|
||||
"cpu" + variant: "X_cpu",
|
||||
}
|
||||
assert.Equal(t, true, rocmDynLibPresent())
|
||||
res = getDynLibs(gpu.GpuInfo{Library: "rocm", Variant: "v6"})
|
||||
assert.Len(t, res, 2)
|
||||
assert.Equal(t, availableDynLibs["rocm"], res[0])
|
||||
assert.Equal(t, availableDynLibs["cpu"+variant], res[1])
|
||||
}
|
||||
@@ -1,8 +0,0 @@
|
||||
package llm
|
||||
|
||||
import (
|
||||
"embed"
|
||||
)
|
||||
|
||||
//go:embed llama.cpp/build/windows/*/*/lib/*.dll*
|
||||
var libEmbed embed.FS
|
||||
71
llm/shim_darwin.go
Normal file
71
llm/shim_darwin.go
Normal file
@@ -0,0 +1,71 @@
|
||||
package llm
|
||||
|
||||
import (
|
||||
"embed"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"io/fs"
|
||||
"log"
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
||||
"github.com/jmorganca/ollama/api"
|
||||
)
|
||||
|
||||
//go:embed llama.cpp/ggml-metal.metal
|
||||
var libEmbed embed.FS
|
||||
|
||||
func newDynamicShimExtServer(library, model string, adapters, projectors []string, opts api.Options) (extServer, error) {
|
||||
// should never happen...
|
||||
return nil, fmt.Errorf("Dynamic library loading not supported on Mac")
|
||||
}
|
||||
|
||||
func nativeInit(workdir string) error {
|
||||
err := extractPayloadFiles(workdir, "llama.cpp/ggml-metal.metal")
|
||||
if err != nil {
|
||||
if err == payloadMissing {
|
||||
// TODO perhaps consider this a hard failure on arm macs?
|
||||
log.Printf("ggml-meta.metal payload missing")
|
||||
return nil
|
||||
}
|
||||
return err
|
||||
}
|
||||
os.Setenv("GGML_METAL_PATH_RESOURCES", workdir)
|
||||
return nil
|
||||
}
|
||||
|
||||
func extractPayloadFiles(workDir, glob string) error {
|
||||
files, err := fs.Glob(libEmbed, glob)
|
||||
if err != nil || len(files) == 0 {
|
||||
return payloadMissing
|
||||
}
|
||||
|
||||
for _, file := range files {
|
||||
srcFile, err := libEmbed.Open(file)
|
||||
if err != nil {
|
||||
return fmt.Errorf("read payload %s: %v", file, err)
|
||||
}
|
||||
defer srcFile.Close()
|
||||
if err := os.MkdirAll(workDir, 0o755); err != nil {
|
||||
return fmt.Errorf("create payload temp dir %s: %v", workDir, err)
|
||||
}
|
||||
|
||||
destFile := filepath.Join(workDir, filepath.Base(file))
|
||||
_, err = os.Stat(destFile)
|
||||
switch {
|
||||
case errors.Is(err, os.ErrNotExist):
|
||||
destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
|
||||
if err != nil {
|
||||
return fmt.Errorf("write payload %s: %v", file, err)
|
||||
}
|
||||
defer destFile.Close()
|
||||
if _, err := io.Copy(destFile, srcFile); err != nil {
|
||||
return fmt.Errorf("copy payload %s: %v", file, err)
|
||||
}
|
||||
case err != nil:
|
||||
return fmt.Errorf("stat payload %s: %v", file, err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
193
llm/shim_ext_server.go
Normal file
193
llm/shim_ext_server.go
Normal file
@@ -0,0 +1,193 @@
|
||||
//go:build !darwin
|
||||
|
||||
package llm
|
||||
|
||||
/*
|
||||
|
||||
#include <stdlib.h>
|
||||
#include "dynamic_shim.h"
|
||||
|
||||
*/
|
||||
import "C"
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"io/fs"
|
||||
"log"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"sync"
|
||||
"unsafe"
|
||||
|
||||
"github.com/jmorganca/ollama/api"
|
||||
)
|
||||
|
||||
// shimExtServer wraps a dynamically loaded llama server; its methods forward
// to the C dynamic_shim_* functions.
type shimExtServer struct {
|
||||
// s is the C-side server handle handed to every dynamic_shim_* call.
s C.struct_dynamic_llama_server
|
||||
// options holds the api.Options given when the server was constructed.
options api.Options
|
||||
}
|
||||
|
||||
// Note: current implementation does not support concurrent instantiations
|
||||
var shimMutex sync.Mutex
|
||||
var llm *shimExtServer
|
||||
|
||||
const pathComponentCount = 6
|
||||
|
||||
func (llm *shimExtServer) llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t) {
|
||||
C.dynamic_shim_llama_server_init(llm.s, sparams, err)
|
||||
}
|
||||
func (llm *shimExtServer) llama_server_start() {
|
||||
C.dynamic_shim_llama_server_start(llm.s)
|
||||
}
|
||||
func (llm *shimExtServer) llama_server_stop() {
|
||||
C.dynamic_shim_llama_server_stop(llm.s)
|
||||
}
|
||||
|
||||
func (llm *shimExtServer) llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t) {
|
||||
C.dynamic_shim_llama_server_completion(llm.s, json_req, resp)
|
||||
}
|
||||
func (llm *shimExtServer) llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t) {
|
||||
C.dynamic_shim_llama_server_completion_next_result(llm.s, task_id, resp)
|
||||
}
|
||||
func (llm *shimExtServer) llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t) {
|
||||
C.dynamic_shim_llama_server_completion_cancel(llm.s, task_id, err)
|
||||
}
|
||||
func (llm *shimExtServer) llama_server_release_task_result(result *C.ext_server_task_result_t) {
|
||||
C.dynamic_shim_llama_server_release_task_result(llm.s, result)
|
||||
}
|
||||
|
||||
func (llm *shimExtServer) llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
|
||||
C.dynamic_shim_llama_server_tokenize(llm.s, json_req, json_resp, err)
|
||||
}
|
||||
func (llm *shimExtServer) llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
|
||||
C.dynamic_shim_llama_server_detokenize(llm.s, json_req, json_resp, err)
|
||||
}
|
||||
func (llm *shimExtServer) llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
|
||||
C.dynamic_shim_llama_server_embedding(llm.s, json_req, json_resp, err)
|
||||
}
|
||||
func (llm *shimExtServer) llama_server_release_json_resp(json_resp **C.char) {
|
||||
C.dynamic_shim_llama_server_release_json_resp(llm.s, json_resp)
|
||||
}
|
||||
|
||||
func newDynamicShimExtServer(library, model string, adapters, projectors []string, opts api.Options) (extServer, error) {
|
||||
shimMutex.Lock()
|
||||
defer shimMutex.Unlock()
|
||||
updatePath(filepath.Dir(library))
|
||||
libPath := C.CString(library)
|
||||
defer C.free(unsafe.Pointer(libPath))
|
||||
resp := newExtServerResp(128)
|
||||
defer freeExtServerResp(resp)
|
||||
var srv C.struct_dynamic_llama_server
|
||||
C.dynamic_shim_init(libPath, &srv, &resp)
|
||||
if resp.id < 0 {
|
||||
return nil, fmt.Errorf("Unable to load dynamic library: %s", C.GoString(resp.msg))
|
||||
}
|
||||
llm = &shimExtServer{
|
||||
s: srv,
|
||||
options: opts,
|
||||
}
|
||||
log.Printf("Loading Dynamic Shim llm server: %s", library)
|
||||
return newExtServer(llm, model, adapters, projectors, opts)
|
||||
}
|
||||
|
||||
func (llm *shimExtServer) Predict(ctx context.Context, pred PredictOpts, fn func(PredictResult)) error {
|
||||
return predict(ctx, llm, pred, fn)
|
||||
}
|
||||
|
||||
func (llm *shimExtServer) Encode(ctx context.Context, prompt string) ([]int, error) {
|
||||
return encode(llm, ctx, prompt)
|
||||
}
|
||||
|
||||
func (llm *shimExtServer) Decode(ctx context.Context, tokens []int) (string, error) {
|
||||
return decode(llm, ctx, tokens)
|
||||
}
|
||||
|
||||
func (llm *shimExtServer) Embedding(ctx context.Context, input string) ([]float64, error) {
|
||||
return embedding(llm, ctx, input)
|
||||
}
|
||||
|
||||
func (llm *shimExtServer) Close() {
|
||||
close(llm)
|
||||
}
|
||||
|
||||
func nativeInit(workdir string) error {
|
||||
libs, err := extractDynamicLibs(workdir, "llama.cpp/build/*/*/lib/*")
|
||||
if err != nil {
|
||||
if err == payloadMissing {
|
||||
log.Printf("%s", payloadMissing)
|
||||
return nil
|
||||
}
|
||||
return err
|
||||
}
|
||||
for _, lib := range libs {
|
||||
// The last dir component is the variant name
|
||||
variant := filepath.Base(filepath.Dir(lib))
|
||||
AvailableShims[variant] = lib
|
||||
}
|
||||
|
||||
if err := verifyDriverAccess(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Report which dynamic libraries we have loaded to assist troubleshooting
|
||||
variants := make([]string, len(AvailableShims))
|
||||
i := 0
|
||||
for variant := range AvailableShims {
|
||||
variants[i] = variant
|
||||
i++
|
||||
}
|
||||
log.Printf("Dynamic LLM variants %v", variants)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func extractDynamicLibs(workDir, glob string) ([]string, error) {
|
||||
files, err := fs.Glob(libEmbed, glob)
|
||||
if err != nil || len(files) == 0 {
|
||||
return nil, payloadMissing
|
||||
}
|
||||
libs := []string{}
|
||||
|
||||
for _, file := range files {
|
||||
pathComps := strings.Split(file, "/")
|
||||
if len(pathComps) != pathComponentCount {
|
||||
log.Printf("unexpected payload components: %v", pathComps)
|
||||
continue
|
||||
}
|
||||
// llama.cpp/build/$OS/$VARIANT/lib/$LIBRARY
|
||||
// Include the variant in the path to avoid conflicts between multiple server libs
|
||||
targetDir := filepath.Join(workDir, pathComps[pathComponentCount-3])
|
||||
srcFile, err := libEmbed.Open(file)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("read payload %s: %v", file, err)
|
||||
}
|
||||
defer srcFile.Close()
|
||||
if err := os.MkdirAll(targetDir, 0o755); err != nil {
|
||||
return nil, fmt.Errorf("create payload temp dir %s: %v", workDir, err)
|
||||
}
|
||||
|
||||
destFile := filepath.Join(targetDir, filepath.Base(file))
|
||||
if strings.Contains(destFile, "server") {
|
||||
libs = append(libs, destFile)
|
||||
}
|
||||
|
||||
_, err = os.Stat(destFile)
|
||||
switch {
|
||||
case errors.Is(err, os.ErrNotExist):
|
||||
destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("write payload %s: %v", file, err)
|
||||
}
|
||||
defer destFile.Close()
|
||||
if _, err := io.Copy(destFile, srcFile); err != nil {
|
||||
return nil, fmt.Errorf("copy payload %s: %v", file, err)
|
||||
}
|
||||
case err != nil:
|
||||
return nil, fmt.Errorf("stat payload %s: %v", file, err)
|
||||
}
|
||||
}
|
||||
return libs, nil
|
||||
}
|
||||
46
llm/shim_ext_server_linux.go
Normal file
46
llm/shim_ext_server_linux.go
Normal file
@@ -0,0 +1,46 @@
|
||||
package llm
|
||||
|
||||
import (
|
||||
"embed"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io/fs"
|
||||
"log"
|
||||
"os"
|
||||
"strings"
|
||||
)
|
||||
|
||||
//go:embed llama.cpp/build/*/*/lib/*.so
|
||||
var libEmbed embed.FS
|
||||
|
||||
// updatePath appends dir to the process PATH (colon-separated) unless one of
// the existing entries already equals it.
//
// An empty PATH becomes just dir: joining with ":" would otherwise create a
// leading empty entry, which is read as the current directory.
func updatePath(dir string) {
	current := os.Getenv("PATH")
	newPath := dir
	if current != "" {
		for _, comp := range strings.Split(current, ":") {
			if comp == dir {
				return
			}
		}
		newPath = current + ":" + dir
	}
	log.Printf("Updating PATH to %s", newPath)
	if err := os.Setenv("PATH", newPath); err != nil {
		log.Printf("Failed to update PATH: %v", err)
	}
}
|
||||
|
||||
func verifyDriverAccess() error {
|
||||
// Only check ROCm access if we have the dynamic lib loaded
|
||||
if _, rocmPresent := AvailableShims["rocm"]; rocmPresent {
|
||||
// Verify we have permissions - either running as root, or we have group access to the driver
|
||||
fd, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0666)
|
||||
if err != nil {
|
||||
if errors.Is(err, fs.ErrPermission) {
|
||||
return fmt.Errorf("Radeon card detected, but permissions not set up properly. Either run ollama as root, or add you user account to the render group.")
|
||||
} else if errors.Is(err, fs.ErrNotExist) {
|
||||
// expected behavior without a radeon card
|
||||
return nil
|
||||
}
|
||||
|
||||
return fmt.Errorf("failed to check permission on /dev/kfd: %w", err)
|
||||
}
|
||||
fd.Close()
|
||||
}
|
||||
return nil
|
||||
}
|
||||
36
llm/shim_ext_server_windows.go
Normal file
36
llm/shim_ext_server_windows.go
Normal file
@@ -0,0 +1,36 @@
|
||||
package llm
|
||||
|
||||
import (
|
||||
"embed"
|
||||
"log"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
)
|
||||
|
||||
//go:embed llama.cpp/build/windows/*/lib/*.dll
|
||||
var libEmbed embed.FS
|
||||
|
||||
// updatePath puts dir at the front of the process PATH (semicolon-separated),
// dropping existing entries that start with dir's parent directory so that
// earlier extraction directories do not linger. Comparisons ignore case.
//
// PATH is left unchanged when dir is already one of its entries.
func updatePath(dir string) {
	tmpDir := strings.ToLower(filepath.Dir(dir))
	var kept []string
	for _, comp := range strings.Split(os.Getenv("PATH"), ";") {
		if strings.EqualFold(comp, dir) {
			return
		}
		// Skip empty entries so an empty PATH does not gain a trailing ";".
		if comp == "" {
			continue
		}
		// Remove any other prior paths to our temp dir. Survivors go into a
		// fresh slice: filtering in place without truncating left stale
		// entries at the tail of the joined result.
		if !strings.HasPrefix(strings.ToLower(comp), tmpDir) {
			kept = append(kept, comp)
		}
	}
	newPath := strings.Join(append([]string{dir}, kept...), ";")
	log.Printf("Updating PATH to %s", newPath)
	if err := os.Setenv("PATH", newPath); err != nil {
		log.Printf("Failed to update PATH: %v", err)
	}
}
|
||||
|
||||
// verifyDriverAccess performs no checks in this file and always returns nil.
func verifyDriverAccess() error {
|
||||
// TODO if applicable
|
||||
return nil
|
||||
}
|
||||
@@ -6,7 +6,7 @@ import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
"log"
|
||||
)
|
||||
|
||||
type Command struct {
|
||||
@@ -59,7 +59,7 @@ func Parse(reader io.Reader) ([]Command, error) {
|
||||
default:
|
||||
if !bytes.HasPrefix(fields[0], []byte("#")) {
|
||||
// log a warning for unknown commands
|
||||
slog.Warn(fmt.Sprintf("Unknown command: %s", fields[0]))
|
||||
log.Printf("WARNING: Unknown command: %s", fields[0])
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -77,7 +77,7 @@ func (p *Progress) Add(key string, state State) {
|
||||
p.states = append(p.states, state)
|
||||
}
|
||||
|
||||
func (p *Progress) render() {
|
||||
func (p *Progress) render() error {
|
||||
p.mu.Lock()
|
||||
defer p.mu.Unlock()
|
||||
|
||||
@@ -101,6 +101,8 @@ func (p *Progress) render() {
|
||||
}
|
||||
|
||||
p.pos = len(p.states)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (p *Progress) start() {
|
||||
|
||||
@@ -23,7 +23,7 @@ type History struct {
|
||||
func NewHistory() (*History, error) {
|
||||
h := &History{
|
||||
Buf: arraylist.New(),
|
||||
Limit: 100, // resizeme
|
||||
Limit: 100, //resizeme
|
||||
Autosave: true,
|
||||
Enabled: true,
|
||||
}
|
||||
@@ -49,7 +49,7 @@ func (h *History) Init() error {
|
||||
|
||||
h.Filename = path
|
||||
|
||||
f, err := os.OpenFile(path, os.O_CREATE|os.O_RDONLY, 0o600)
|
||||
f, err := os.OpenFile(path, os.O_CREATE|os.O_RDONLY, 0600)
|
||||
if err != nil {
|
||||
if errors.Is(err, os.ErrNotExist) {
|
||||
return nil
|
||||
@@ -84,7 +84,7 @@ func (h *History) Add(l []rune) {
|
||||
h.Compact()
|
||||
h.Pos = h.Size()
|
||||
if h.Autosave {
|
||||
_ = h.Save()
|
||||
h.Save()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -132,7 +132,7 @@ func (h *History) Save() error {
|
||||
|
||||
tmpFile := h.Filename + ".tmp"
|
||||
|
||||
f, err := os.OpenFile(tmpFile, os.O_CREATE|os.O_WRONLY|os.O_TRUNC|os.O_APPEND, 0o600)
|
||||
f, err := os.OpenFile(tmpFile, os.O_CREATE|os.O_WRONLY|os.O_TRUNC|os.O_APPEND, 0666)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -72,7 +72,6 @@ func (i *Instance) Readline() (string, error) {
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
// nolint: errcheck
|
||||
defer UnsetRawMode(fd, termios)
|
||||
|
||||
buf, _ := NewBuffer(i.Prompt)
|
||||
|
||||
@@ -11,7 +11,7 @@ func handleCharCtrlZ(fd int, termios *Termios) (string, error) {
|
||||
return "", err
|
||||
}
|
||||
|
||||
_ = syscall.Kill(0, syscall.SIGSTOP)
|
||||
syscall.Kill(0, syscall.SIGSTOP)
|
||||
|
||||
// on resume...
|
||||
return "", nil
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#!/bin/sh
|
||||
|
||||
set -e
|
||||
set -eu
|
||||
|
||||
export VERSION=${VERSION:-0.0.0}
|
||||
export GOFLAGS="'-ldflags=-w -s \"-X=github.com/jmorganca/ollama/version.Version=$VERSION\" \"-X=github.com/jmorganca/ollama/server.mode=release\"'"
|
||||
@@ -11,36 +11,21 @@ for TARGETARCH in arm64 amd64; do
|
||||
rm -rf llm/llama.cpp/build
|
||||
GOOS=darwin GOARCH=$TARGETARCH go generate ./...
|
||||
CGO_ENABLED=1 GOOS=darwin GOARCH=$TARGETARCH go build -o dist/ollama-darwin-$TARGETARCH
|
||||
CGO_ENABLED=1 GOOS=darwin GOARCH=$TARGETARCH go build -cover -o dist/ollama-darwin-$TARGETARCH-cov
|
||||
done
|
||||
|
||||
lipo -create -output dist/ollama dist/ollama-darwin-arm64 dist/ollama-darwin-amd64
|
||||
rm -f dist/ollama-darwin-arm64 dist/ollama-darwin-amd64
|
||||
if [ -n "$APPLE_IDENTITY" ]; then
|
||||
codesign --deep --force --options=runtime --sign "$APPLE_IDENTITY" --timestamp dist/ollama
|
||||
else
|
||||
echo "Skipping code signing - set APPLE_IDENTITY"
|
||||
fi
|
||||
lipo -create -output dist/ollama dist/ollama-darwin-*
|
||||
rm -f dist/ollama-darwin-*
|
||||
codesign --deep --force --options=runtime --sign "$APPLE_IDENTITY" --timestamp dist/ollama
|
||||
chmod +x dist/ollama
|
||||
|
||||
# build and optionally sign the mac app
|
||||
# build and sign the mac app
|
||||
npm install --prefix app
|
||||
if [ -n "$APPLE_IDENTITY" ]; then
|
||||
npm run --prefix app make:sign
|
||||
else
|
||||
npm run --prefix app make
|
||||
fi
|
||||
npm run --prefix app make:sign
|
||||
cp app/out/make/zip/darwin/universal/Ollama-darwin-universal-$VERSION.zip dist/Ollama-darwin.zip
|
||||
|
||||
# sign the binary and rename it
|
||||
if [ -n "$APPLE_IDENTITY" ]; then
|
||||
codesign -f --timestamp -s "$APPLE_IDENTITY" --identifier ai.ollama.ollama --options=runtime dist/ollama
|
||||
else
|
||||
echo "WARNING: Skipping code signing - set APPLE_IDENTITY"
|
||||
fi
|
||||
codesign -f --timestamp -s "$APPLE_IDENTITY" --identifier ai.ollama.ollama --options=runtime dist/ollama
|
||||
ditto -c -k --keepParent dist/ollama dist/temp.zip
|
||||
if [ -n "$APPLE_IDENTITY" ]; then
|
||||
xcrun notarytool submit dist/temp.zip --wait --timeout 10m --apple-id $APPLE_ID --password $APPLE_PASSWORD --team-id $APPLE_TEAM_ID
|
||||
fi
|
||||
xcrun notarytool submit dist/temp.zip --wait --timeout 10m --apple-id $APPLE_ID --password $APPLE_PASSWORD --team-id $APPLE_TEAM_ID
|
||||
mv dist/ollama dist/ollama-darwin
|
||||
rm -f dist/temp.zip
|
||||
|
||||
@@ -5,11 +5,10 @@ set -eu
|
||||
export VERSION=${VERSION:-0.0.0}
|
||||
export GOFLAGS="'-ldflags=-w -s \"-X=github.com/jmorganca/ollama/version.Version=$VERSION\" \"-X=github.com/jmorganca/ollama/server.mode=release\"'"
|
||||
|
||||
BUILD_ARCH=${BUILD_ARCH:-"amd64 arm64"}
|
||||
mkdir -p dist
|
||||
|
||||
for TARGETARCH in ${BUILD_ARCH}; do
|
||||
docker build --platform=linux/$TARGETARCH --build-arg=GOFLAGS --build-arg=CGO_CFLAGS --build-arg=OLLAMA_CUSTOM_CPU_DEFS -f Dockerfile.build -t builder:$TARGETARCH .
|
||||
for TARGETARCH in amd64 arm64; do
|
||||
docker build --platform=linux/$TARGETARCH --build-arg=GOFLAGS --build-arg=CGO_CFLAGS -f Dockerfile.build -t builder:$TARGETARCH .
|
||||
docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH
|
||||
docker cp builder-$TARGETARCH:/go/src/github.com/jmorganca/ollama/ollama ./dist/ollama-linux-$TARGETARCH
|
||||
docker rm builder-$TARGETARCH
|
||||
|
||||
@@ -66,7 +66,3 @@ subprocess.check_call(['ssh', netloc, 'cd', path, ';', GoCmd, 'generate', './...
|
||||
print("Building")
|
||||
subprocess.check_call(['ssh', netloc, 'cd', path, ';', GoCmd, 'build', '.'])
|
||||
|
||||
print("Copying built result")
|
||||
subprocess.check_call(['scp', netloc +":"+ path + "/ollama.exe", './dist/'])
|
||||
|
||||
|
||||
|
||||
@@ -231,8 +231,8 @@ if ! check_gpu nvidia-smi || [ -z "$(nvidia-smi | grep -o "CUDA Version: [0-9]*\
|
||||
case $OS_NAME in
|
||||
centos|rhel) install_cuda_driver_yum 'rhel' $(echo $OS_VERSION | cut -d '.' -f 1) ;;
|
||||
rocky) install_cuda_driver_yum 'rhel' $(echo $OS_VERSION | cut -c1) ;;
|
||||
fedora) [ $OS_VERSION -lt '37' ] && install_cuda_driver_yum $OS_NAME $OS_VERSION || install_cuda_driver_yum $OS_NAME '37';;
|
||||
amzn) install_cuda_driver_yum 'fedora' '37' ;;
|
||||
fedora) install_cuda_driver_yum $OS_NAME $OS_VERSION ;;
|
||||
amzn) install_cuda_driver_yum 'fedora' '35' ;;
|
||||
debian) install_cuda_driver_apt $OS_NAME $OS_VERSION ;;
|
||||
ubuntu) install_cuda_driver_apt $OS_NAME $(echo $OS_VERSION | sed 's/\.//') ;;
|
||||
*) exit ;;
|
||||
|
||||
@@ -1,43 +0,0 @@
|
||||
#!/bin/sh

# Script for common Dockerfile dependency installation in redhat linux based images
#
# Optional environment:
#   CMAKE_VERSION  - when set, that CMake release is unpacked into /usr
#   GOLANG_VERSION - when set, that Go release is unpacked into /usr/local

set -ex
MACHINE=$(uname -m)

if grep -i "centos" /etc/system-release >/dev/null; then
    # Centos 7 derivatives have too old of a git version to run our generate script
    # uninstall and ignore failures (without "|| true", set -e would abort here)
    yum remove -y git || true
    yum -y install epel-release centos-release-scl
    yum -y install dnf
    if [ "${MACHINE}" = "x86_64" ]; then
        yum -y install https://repo.ius.io/ius-release-el7.rpm
        dnf install -y git236
    else
        dnf install -y rh-git227-git
        ln -s /opt/rh/rh-git227/root/usr/bin/git /usr/local/bin/git
    fi
    dnf install -y devtoolset-10-gcc devtoolset-10-gcc-c++
elif grep -i "rocky" /etc/system-release >/dev/null; then
    dnf install -y git gcc-toolset-10-gcc gcc-toolset-10-gcc-c++
else
    echo "ERROR Unexpected distro"
    exit 1
fi

if [ -n "${CMAKE_VERSION}" ]; then
    # -f makes curl fail on HTTP errors instead of piping an error page into tar
    curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-${MACHINE}.tar.gz | tar -zx -C /usr --strip-components 1
fi

if [ -n "${GOLANG_VERSION}" ]; then
    if [ "${MACHINE}" = "x86_64" ]; then
        GO_ARCH="amd64"
    else
        GO_ARCH="arm64"
    fi
    mkdir -p /usr/local
    curl -fsSL https://dl.google.com/go/go${GOLANG_VERSION}.linux-${GO_ARCH}.tar.gz | tar xz -C /usr/local
    ln -s /usr/local/go/bin/go /usr/local/bin/go
    ln -s /usr/local/go/bin/gofmt /usr/local/bin/gofmt
fi
|
||||
@@ -10,7 +10,7 @@ import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
"log"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
@@ -86,7 +86,7 @@ func getAuthToken(ctx context.Context, redirData AuthRedirect) (string, error) {
|
||||
|
||||
rawKey, err := os.ReadFile(keyPath)
|
||||
if err != nil {
|
||||
slog.Info(fmt.Sprintf("Failed to load private key: %v", err))
|
||||
log.Printf("Failed to load private key: %v", err)
|
||||
return "", err
|
||||
}
|
||||
|
||||
@@ -105,7 +105,7 @@ func getAuthToken(ctx context.Context, redirData AuthRedirect) (string, error) {
|
||||
headers.Set("Authorization", sig)
|
||||
resp, err := makeRequest(ctx, http.MethodGet, redirectURL, headers, nil, nil)
|
||||
if err != nil {
|
||||
slog.Info(fmt.Sprintf("couldn't get token: %q", err))
|
||||
log.Printf("couldn't get token: %q", err)
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
@@ -6,7 +6,7 @@ import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
"log"
|
||||
"math"
|
||||
"net/http"
|
||||
"net/url"
|
||||
@@ -98,7 +98,7 @@ func (b *blobDownload) Prepare(ctx context.Context, requestURL *url.URL, opts *R
|
||||
|
||||
b.Total, _ = strconv.ParseInt(resp.Header.Get("Content-Length"), 10, 64)
|
||||
|
||||
size := b.Total / numDownloadParts
|
||||
var size = b.Total / numDownloadParts
|
||||
switch {
|
||||
case size < minDownloadPartSize:
|
||||
size = minDownloadPartSize
|
||||
@@ -120,7 +120,7 @@ func (b *blobDownload) Prepare(ctx context.Context, requestURL *url.URL, opts *R
|
||||
}
|
||||
}
|
||||
|
||||
slog.Info(fmt.Sprintf("downloading %s in %d %s part(s)", b.Digest[7:19], len(b.Parts), format.HumanBytes(b.Parts[0].Size)))
|
||||
log.Printf("downloading %s in %d %s part(s)", b.Digest[7:19], len(b.Parts), format.HumanBytes(b.Parts[0].Size))
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -132,13 +132,13 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *Regis
|
||||
defer blobDownloadManager.Delete(b.Digest)
|
||||
ctx, b.CancelFunc = context.WithCancel(ctx)
|
||||
|
||||
file, err := os.OpenFile(b.Name+"-partial", os.O_CREATE|os.O_RDWR, 0o644)
|
||||
file, err := os.OpenFile(b.Name+"-partial", os.O_CREATE|os.O_RDWR, 0644)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
_ = file.Truncate(b.Total)
|
||||
file.Truncate(b.Total)
|
||||
|
||||
g, inner := errgroup.WithContext(ctx)
|
||||
g.SetLimit(numDownloadParts)
|
||||
@@ -159,7 +159,7 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *Regis
|
||||
return err
|
||||
case err != nil:
|
||||
sleep := time.Second * time.Duration(math.Pow(2, float64(try)))
|
||||
slog.Info(fmt.Sprintf("%s part %d attempt %d failed: %v, retrying in %s", b.Digest[7:19], part.N, try, err, sleep))
|
||||
log.Printf("%s part %d attempt %d failed: %v, retrying in %s", b.Digest[7:19], part.N, try, err, sleep)
|
||||
time.Sleep(sleep)
|
||||
continue
|
||||
default:
|
||||
@@ -246,7 +246,7 @@ func (b *blobDownload) readPart(partName string) (*blobDownloadPart, error) {
|
||||
}
|
||||
|
||||
func (b *blobDownload) writePart(partName string, part *blobDownloadPart) error {
|
||||
partFile, err := os.OpenFile(partName, os.O_CREATE|os.O_RDWR|os.O_TRUNC, 0o644)
|
||||
partFile, err := os.OpenFile(partName, os.O_CREATE|os.O_RDWR|os.O_TRUNC, 0644)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -340,7 +340,6 @@ func downloadBlob(ctx context.Context, opts downloadOpts) error {
|
||||
return err
|
||||
}
|
||||
|
||||
// nolint: contextcheck
|
||||
go download.Run(context.Background(), requestURL, opts.regOpts)
|
||||
}
|
||||
|
||||
|
||||
@@ -10,7 +10,6 @@ import (
|
||||
"fmt"
|
||||
"io"
|
||||
"log"
|
||||
"log/slog"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
@@ -337,7 +336,7 @@ func GetModel(name string) (*Model, error) {
|
||||
case "application/vnd.ollama.image.embed":
|
||||
// Deprecated in versions > 0.1.2
|
||||
// TODO: remove this warning in a future version
|
||||
slog.Info("WARNING: model contains embeddings, but embeddings in modelfiles have been deprecated and will be ignored.")
|
||||
log.Print("WARNING: model contains embeddings, but embeddings in modelfiles have been deprecated and will be ignored.")
|
||||
case "application/vnd.ollama.image.adapter":
|
||||
model.AdapterPaths = append(model.AdapterPaths, filename)
|
||||
case "application/vnd.ollama.image.projector":
|
||||
@@ -428,7 +427,7 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars
|
||||
fromParams := make(map[string]any)
|
||||
|
||||
for _, c := range commands {
|
||||
slog.Info(fmt.Sprintf("[%s] - %s", c.Name, c.Args))
|
||||
log.Printf("[%s] - %s", c.Name, c.Args)
|
||||
mediatype := fmt.Sprintf("application/vnd.ollama.image.%s", c.Name)
|
||||
|
||||
switch c.Name {
|
||||
@@ -748,7 +747,6 @@ func deleteUnusedLayers(skipModelPath *ModelPath, deleteMap map[string]struct{},
|
||||
// save (i.e. delete from the deleteMap) any files used in other manifests
|
||||
manifest, _, err := GetManifest(fmp)
|
||||
if err != nil {
|
||||
// nolint: nilerr
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -768,16 +766,16 @@ func deleteUnusedLayers(skipModelPath *ModelPath, deleteMap map[string]struct{},
|
||||
for k := range deleteMap {
|
||||
fp, err := GetBlobsPath(k)
|
||||
if err != nil {
|
||||
slog.Info(fmt.Sprintf("couldn't get file path for '%s': %v", k, err))
|
||||
log.Printf("couldn't get file path for '%s': %v", k, err)
|
||||
continue
|
||||
}
|
||||
if !dryRun {
|
||||
if err := os.Remove(fp); err != nil {
|
||||
slog.Info(fmt.Sprintf("couldn't remove file '%s': %v", fp, err))
|
||||
log.Printf("couldn't remove file '%s': %v", fp, err)
|
||||
continue
|
||||
}
|
||||
} else {
|
||||
slog.Info(fmt.Sprintf("wanted to remove: %s", fp))
|
||||
log.Printf("wanted to remove: %s", fp)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -793,7 +791,7 @@ func PruneLayers() error {
|
||||
|
||||
blobs, err := os.ReadDir(p)
|
||||
if err != nil {
|
||||
slog.Info(fmt.Sprintf("couldn't read dir '%s': %v", p, err))
|
||||
log.Printf("couldn't read dir '%s': %v", p, err)
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -807,14 +805,14 @@ func PruneLayers() error {
|
||||
}
|
||||
}
|
||||
|
||||
slog.Info(fmt.Sprintf("total blobs: %d", len(deleteMap)))
|
||||
log.Printf("total blobs: %d", len(deleteMap))
|
||||
|
||||
err = deleteUnusedLayers(nil, deleteMap, false)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
slog.Info(fmt.Sprintf("total unused blobs removed: %d", len(deleteMap)))
|
||||
log.Printf("total unused blobs removed: %d", len(deleteMap))
|
||||
|
||||
return nil
|
||||
}
|
||||
@@ -876,7 +874,7 @@ func DeleteModel(name string) error {
|
||||
}
|
||||
err = os.Remove(fp)
|
||||
if err != nil {
|
||||
slog.Info(fmt.Sprintf("couldn't remove manifest file '%s': %v", fp, err))
|
||||
log.Printf("couldn't remove manifest file '%s': %v", fp, err)
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -930,14 +928,14 @@ PARAMETER {{ $k }} {{ printf "%#v" $parameter }}
|
||||
|
||||
tmpl, err := template.New("").Parse(modelFile)
|
||||
if err != nil {
|
||||
slog.Info(fmt.Sprintf("error parsing template: %q", err))
|
||||
log.Printf("error parsing template: %q", err)
|
||||
return "", err
|
||||
}
|
||||
|
||||
var buf bytes.Buffer
|
||||
|
||||
if err = tmpl.Execute(&buf, mt); err != nil {
|
||||
slog.Info(fmt.Sprintf("error executing template: %q", err))
|
||||
log.Printf("error executing template: %q", err)
|
||||
return "", err
|
||||
}
|
||||
|
||||
@@ -964,7 +962,7 @@ func PushModel(ctx context.Context, name string, regOpts *RegistryOptions, fn fu
|
||||
|
||||
for _, layer := range layers {
|
||||
if err := uploadBlob(ctx, mp, layer, regOpts, fn); err != nil {
|
||||
slog.Info(fmt.Sprintf("error uploading blob: %v", err))
|
||||
log.Printf("error uploading blob: %v", err)
|
||||
if errors.Is(err, errUnauthorized) {
|
||||
return fmt.Errorf("unable to push %s, make sure this namespace exists and you are authorized to push to it", ParseModelPath(name).GetNamespaceRepository())
|
||||
}
|
||||
@@ -1059,7 +1057,7 @@ func PullModel(ctx context.Context, name string, regOpts *RegistryOptions, fn fu
|
||||
}
|
||||
if err := os.Remove(fp); err != nil {
|
||||
// log this, but return the original error
|
||||
slog.Info(fmt.Sprintf("couldn't remove file with digest mismatch '%s': %v", fp, err))
|
||||
log.Printf("couldn't remove file with digest mismatch '%s': %v", fp, err)
|
||||
}
|
||||
}
|
||||
return err
|
||||
@@ -1083,7 +1081,7 @@ func PullModel(ctx context.Context, name string, regOpts *RegistryOptions, fn fu
|
||||
|
||||
err = os.WriteFile(fp, manifestJSON, 0o644)
|
||||
if err != nil {
|
||||
slog.Info(fmt.Sprintf("couldn't write to %s", fp))
|
||||
log.Printf("couldn't write to %s", fp)
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -1133,46 +1131,49 @@ func GetSHA256Digest(r io.Reader) (string, int64) {
|
||||
var errUnauthorized = fmt.Errorf("unauthorized")
|
||||
|
||||
func makeRequestWithRetry(ctx context.Context, method string, requestURL *url.URL, headers http.Header, body io.ReadSeeker, regOpts *RegistryOptions) (*http.Response, error) {
|
||||
for i := 0; i < 2; i++ {
|
||||
resp, err := makeRequest(ctx, method, requestURL, headers, body, regOpts)
|
||||
if err != nil {
|
||||
if !errors.Is(err, context.Canceled) {
|
||||
slog.Info(fmt.Sprintf("request failed: %v", err))
|
||||
}
|
||||
|
||||
return nil, err
|
||||
resp, err := makeRequest(ctx, method, requestURL, headers, body, regOpts)
|
||||
if err != nil {
|
||||
if !errors.Is(err, context.Canceled) {
|
||||
log.Printf("request failed: %v", err)
|
||||
}
|
||||
|
||||
switch {
|
||||
case resp.StatusCode == http.StatusUnauthorized:
|
||||
// Handle authentication error with one retry
|
||||
auth := resp.Header.Get("www-authenticate")
|
||||
authRedir := ParseAuthRedirectString(auth)
|
||||
token, err := getAuthToken(ctx, authRedir)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
switch {
|
||||
case resp.StatusCode == http.StatusUnauthorized:
|
||||
// Handle authentication error with one retry
|
||||
auth := resp.Header.Get("www-authenticate")
|
||||
authRedir := ParseAuthRedirectString(auth)
|
||||
token, err := getAuthToken(ctx, authRedir)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
regOpts.Token = token
|
||||
if body != nil {
|
||||
_, err = body.Seek(0, io.SeekStart)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
regOpts.Token = token
|
||||
if body != nil {
|
||||
_, err = body.Seek(0, io.SeekStart)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
case resp.StatusCode == http.StatusNotFound:
|
||||
return nil, os.ErrNotExist
|
||||
case resp.StatusCode >= http.StatusBadRequest:
|
||||
responseBody, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("%d: %s", resp.StatusCode, err)
|
||||
}
|
||||
return nil, fmt.Errorf("%d: %s", resp.StatusCode, responseBody)
|
||||
default:
|
||||
return resp, nil
|
||||
}
|
||||
|
||||
resp, err := makeRequest(ctx, method, requestURL, headers, body, regOpts)
|
||||
if resp.StatusCode == http.StatusUnauthorized {
|
||||
return nil, errUnauthorized
|
||||
}
|
||||
|
||||
return resp, err
|
||||
case resp.StatusCode == http.StatusNotFound:
|
||||
return nil, os.ErrNotExist
|
||||
case resp.StatusCode >= http.StatusBadRequest:
|
||||
responseBody, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("%d: %s", resp.StatusCode, err)
|
||||
}
|
||||
return nil, fmt.Errorf("%d: %s", resp.StatusCode, responseBody)
|
||||
}
|
||||
|
||||
return nil, errUnauthorized
|
||||
return resp, nil
|
||||
}
|
||||
|
||||
func makeRequest(ctx context.Context, method string, requestURL *url.URL, headers http.Header, body io.Reader, regOpts *RegistryOptions) (*http.Response, error) {
|
||||
|
||||
@@ -26,9 +26,9 @@ func WriteManifest(name string, config *Layer, layers []*Layer) error {
|
||||
return err
|
||||
}
|
||||
|
||||
if err := os.MkdirAll(filepath.Dir(manifestPath), 0o755); err != nil {
|
||||
if err := os.MkdirAll(filepath.Dir(manifestPath), 0755); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return os.WriteFile(manifestPath, b.Bytes(), 0o644)
|
||||
return os.WriteFile(manifestPath, b.Bytes(), 0644)
|
||||
}
|
||||
|
||||
@@ -46,8 +46,7 @@ func ParseModelPath(name string) ModelPath {
|
||||
name = after
|
||||
}
|
||||
|
||||
name = strings.ReplaceAll(name, string(os.PathSeparator), "/")
|
||||
parts := strings.Split(name, "/")
|
||||
parts := strings.Split(name, string(os.PathSeparator))
|
||||
switch len(parts) {
|
||||
case 3:
|
||||
mp.Registry = parts[0]
|
||||
|
||||
125
server/routes.go
125
server/routes.go
@@ -7,7 +7,7 @@ import (
|
||||
"fmt"
|
||||
"io"
|
||||
"io/fs"
|
||||
"log/slog"
|
||||
"log"
|
||||
"net"
|
||||
"net/http"
|
||||
"os"
|
||||
@@ -15,6 +15,7 @@ import (
|
||||
"path/filepath"
|
||||
"reflect"
|
||||
"runtime"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"syscall"
|
||||
@@ -73,7 +74,7 @@ func load(c *gin.Context, model *Model, opts api.Options, sessionDuration time.D
|
||||
|
||||
if needLoad {
|
||||
if loaded.runner != nil {
|
||||
slog.Info("changing loaded model")
|
||||
log.Println("changing loaded model")
|
||||
loaded.runner.Close()
|
||||
loaded.runner = nil
|
||||
loaded.Model = nil
|
||||
@@ -197,8 +198,7 @@ func GenerateHandler(c *gin.Context) {
|
||||
c.JSON(http.StatusOK, api.GenerateResponse{
|
||||
CreatedAt: time.Now().UTC(),
|
||||
Model: req.Model,
|
||||
Done: true,
|
||||
})
|
||||
Done: true})
|
||||
return
|
||||
}
|
||||
|
||||
@@ -391,7 +391,7 @@ func EmbeddingHandler(c *gin.Context) {
|
||||
|
||||
embedding, err := loaded.runner.Embedding(c.Request.Context(), req.Prompt)
|
||||
if err != nil {
|
||||
slog.Info(fmt.Sprintf("embedding generation failed: %v", err))
|
||||
log.Printf("embedding generation failed: %v", err)
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to generate embedding"})
|
||||
return
|
||||
}
|
||||
@@ -414,13 +414,8 @@ func PullModelHandler(c *gin.Context) {
|
||||
return
|
||||
}
|
||||
|
||||
var model string
|
||||
if req.Model != "" {
|
||||
model = req.Model
|
||||
} else if req.Name != "" {
|
||||
model = req.Name
|
||||
} else {
|
||||
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "model is required"})
|
||||
if req.Name == "" {
|
||||
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "name is required"})
|
||||
return
|
||||
}
|
||||
|
||||
@@ -438,7 +433,7 @@ func PullModelHandler(c *gin.Context) {
|
||||
ctx, cancel := context.WithCancel(c.Request.Context())
|
||||
defer cancel()
|
||||
|
||||
if err := PullModel(ctx, model, regOpts, fn); err != nil {
|
||||
if err := PullModel(ctx, req.Name, regOpts, fn); err != nil {
|
||||
ch <- gin.H{"error": err.Error()}
|
||||
}
|
||||
}()
|
||||
@@ -463,13 +458,8 @@ func PushModelHandler(c *gin.Context) {
|
||||
return
|
||||
}
|
||||
|
||||
var model string
|
||||
if req.Model != "" {
|
||||
model = req.Model
|
||||
} else if req.Name != "" {
|
||||
model = req.Name
|
||||
} else {
|
||||
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "model is required"})
|
||||
if req.Name == "" {
|
||||
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "name is required"})
|
||||
return
|
||||
}
|
||||
|
||||
@@ -487,7 +477,7 @@ func PushModelHandler(c *gin.Context) {
|
||||
ctx, cancel := context.WithCancel(c.Request.Context())
|
||||
defer cancel()
|
||||
|
||||
if err := PushModel(ctx, model, regOpts, fn); err != nil {
|
||||
if err := PushModel(ctx, req.Name, regOpts, fn); err != nil {
|
||||
ch <- gin.H{"error": err.Error()}
|
||||
}
|
||||
}()
|
||||
@@ -512,17 +502,12 @@ func CreateModelHandler(c *gin.Context) {
|
||||
return
|
||||
}
|
||||
|
||||
var model string
|
||||
if req.Model != "" {
|
||||
model = req.Model
|
||||
} else if req.Name != "" {
|
||||
model = req.Name
|
||||
} else {
|
||||
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "model is required"})
|
||||
if req.Name == "" {
|
||||
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "name is required"})
|
||||
return
|
||||
}
|
||||
|
||||
if err := ParseModelPath(model).Validate(); err != nil {
|
||||
if err := ParseModelPath(req.Name).Validate(); err != nil {
|
||||
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
@@ -560,7 +545,7 @@ func CreateModelHandler(c *gin.Context) {
|
||||
ctx, cancel := context.WithCancel(c.Request.Context())
|
||||
defer cancel()
|
||||
|
||||
if err := CreateModel(ctx, model, filepath.Dir(req.Path), commands, fn); err != nil {
|
||||
if err := CreateModel(ctx, req.Name, filepath.Dir(req.Path), commands, fn); err != nil {
|
||||
ch <- gin.H{"error": err.Error()}
|
||||
}
|
||||
}()
|
||||
@@ -585,19 +570,14 @@ func DeleteModelHandler(c *gin.Context) {
|
||||
return
|
||||
}
|
||||
|
||||
var model string
|
||||
if req.Model != "" {
|
||||
model = req.Model
|
||||
} else if req.Name != "" {
|
||||
model = req.Name
|
||||
} else {
|
||||
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "model is required"})
|
||||
if req.Name == "" {
|
||||
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "name is required"})
|
||||
return
|
||||
}
|
||||
|
||||
if err := DeleteModel(model); err != nil {
|
||||
if err := DeleteModel(req.Name); err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found", model)})
|
||||
c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found", req.Name)})
|
||||
} else {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||
}
|
||||
@@ -630,19 +610,21 @@ func ShowModelHandler(c *gin.Context) {
|
||||
return
|
||||
}
|
||||
|
||||
if req.Model != "" {
|
||||
// noop
|
||||
} else if req.Name != "" {
|
||||
req.Model = req.Name
|
||||
} else {
|
||||
switch {
|
||||
case req.Model == "" && req.Name == "":
|
||||
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "model is required"})
|
||||
return
|
||||
case req.Model != "" && req.Name != "":
|
||||
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "both model and name are set"})
|
||||
return
|
||||
case req.Model == "" && req.Name != "":
|
||||
req.Model = req.Name
|
||||
}
|
||||
|
||||
resp, err := GetModelInfo(req)
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found", req.Model)})
|
||||
c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found", req.Name)})
|
||||
} else {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||
}
|
||||
@@ -685,12 +667,27 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
|
||||
cs := 30
|
||||
for k, v := range model.Options {
|
||||
switch val := v.(type) {
|
||||
case string:
|
||||
params = append(params, fmt.Sprintf("%-*s %s", cs, k, val))
|
||||
case int:
|
||||
params = append(params, fmt.Sprintf("%-*s %s", cs, k, strconv.Itoa(val)))
|
||||
case float64:
|
||||
params = append(params, fmt.Sprintf("%-*s %s", cs, k, strconv.FormatFloat(val, 'f', 0, 64)))
|
||||
case bool:
|
||||
params = append(params, fmt.Sprintf("%-*s %s", cs, k, strconv.FormatBool(val)))
|
||||
case []interface{}:
|
||||
for _, nv := range val {
|
||||
params = append(params, fmt.Sprintf("%-*s %#v", cs, k, nv))
|
||||
switch nval := nv.(type) {
|
||||
case string:
|
||||
params = append(params, fmt.Sprintf("%-*s %s", cs, k, nval))
|
||||
case int:
|
||||
params = append(params, fmt.Sprintf("%-*s %s", cs, k, strconv.Itoa(nval)))
|
||||
case float64:
|
||||
params = append(params, fmt.Sprintf("%-*s %s", cs, k, strconv.FormatFloat(nval, 'f', 0, 64)))
|
||||
case bool:
|
||||
params = append(params, fmt.Sprintf("%-*s %s", cs, k, strconv.FormatBool(nval)))
|
||||
}
|
||||
}
|
||||
default:
|
||||
params = append(params, fmt.Sprintf("%-*s %#v", cs, k, v))
|
||||
}
|
||||
}
|
||||
resp.Parameters = strings.Join(params, "\n")
|
||||
@@ -713,7 +710,7 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
|
||||
|
||||
func ListModelsHandler(c *gin.Context) {
|
||||
models := make([]api.ModelResponse, 0)
|
||||
manifestsPath, err := GetManifestPath()
|
||||
fp, err := GetManifestPath()
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||
return
|
||||
@@ -734,7 +731,6 @@ func ListModelsHandler(c *gin.Context) {
|
||||
}
|
||||
|
||||
return api.ModelResponse{
|
||||
Model: model.ShortName,
|
||||
Name: model.ShortName,
|
||||
Size: model.Size,
|
||||
Digest: model.Digest,
|
||||
@@ -744,15 +740,13 @@ func ListModelsHandler(c *gin.Context) {
|
||||
|
||||
walkFunc := func(path string, info os.FileInfo, _ error) error {
|
||||
if !info.IsDir() {
|
||||
path, tag := filepath.Split(path)
|
||||
model := strings.Trim(strings.TrimPrefix(path, manifestsPath), string(os.PathSeparator))
|
||||
modelPath := strings.Join([]string{model, tag}, ":")
|
||||
canonicalModelPath := strings.ReplaceAll(modelPath, string(os.PathSeparator), "/")
|
||||
dir, file := filepath.Split(path)
|
||||
dir = strings.Trim(strings.TrimPrefix(dir, fp), string(os.PathSeparator))
|
||||
tag := strings.Join([]string{dir, file}, ":")
|
||||
|
||||
resp, err := modelResponse(canonicalModelPath)
|
||||
resp, err := modelResponse(tag)
|
||||
if err != nil {
|
||||
slog.Info(fmt.Sprintf("skipping file: %s", canonicalModelPath))
|
||||
// nolint: nilerr
|
||||
log.Printf("skipping file: %s", fp)
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -763,7 +757,7 @@ func ListModelsHandler(c *gin.Context) {
|
||||
return nil
|
||||
}
|
||||
|
||||
if err := filepath.Walk(manifestsPath, walkFunc); err != nil {
|
||||
if err := filepath.Walk(fp, walkFunc); err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
@@ -911,13 +905,6 @@ func (s *Server) GenerateRoutes() http.Handler {
|
||||
}
|
||||
|
||||
func Serve(ln net.Listener) error {
|
||||
if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
|
||||
var programLevel = new(slog.LevelVar)
|
||||
h := slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: programLevel, AddSource: true})
|
||||
slog.SetDefault(slog.New(h))
|
||||
programLevel.Set(slog.LevelDebug)
|
||||
slog.Debug("Debug logging enabled")
|
||||
}
|
||||
if noprune := os.Getenv("OLLAMA_NOPRUNE"); noprune == "" {
|
||||
// clean up unused layers and manifests
|
||||
if err := PruneLayers(); err != nil {
|
||||
@@ -940,7 +927,7 @@ func Serve(ln net.Listener) error {
|
||||
}
|
||||
r := s.GenerateRoutes()
|
||||
|
||||
slog.Info(fmt.Sprintf("Listening on %s (version %s)", ln.Addr(), version.Version))
|
||||
log.Printf("Listening on %s (version %s)", ln.Addr(), version.Version)
|
||||
srvr := &http.Server{
|
||||
Handler: r,
|
||||
}
|
||||
@@ -963,7 +950,7 @@ func Serve(ln net.Listener) error {
|
||||
if runtime.GOOS == "linux" { // TODO - windows too
|
||||
// check compatibility to log warnings
|
||||
if _, err := gpu.CheckVRAM(); err != nil {
|
||||
slog.Info(err.Error())
|
||||
log.Print(err.Error())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1005,14 +992,14 @@ func streamResponse(c *gin.Context, ch chan any) {
|
||||
|
||||
bts, err := json.Marshal(val)
|
||||
if err != nil {
|
||||
slog.Info(fmt.Sprintf("streamResponse: json.Marshal failed with %s", err))
|
||||
log.Printf("streamResponse: json.Marshal failed with %s", err)
|
||||
return false
|
||||
}
|
||||
|
||||
// Delineate chunks with new-line delimiter
|
||||
bts = append(bts, '\n')
|
||||
if _, err := w.Write(bts); err != nil {
|
||||
slog.Info(fmt.Sprintf("streamResponse: w.Write failed with %s", err))
|
||||
log.Printf("streamResponse: w.Write failed with %s", err)
|
||||
return false
|
||||
}
|
||||
|
||||
|
||||
@@ -9,7 +9,6 @@ import (
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"sort"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
@@ -51,7 +50,7 @@ func Test_Routes(t *testing.T) {
|
||||
createTestModel := func(t *testing.T, name string) {
|
||||
fname := createTestFile(t, "ollama-model")
|
||||
|
||||
modelfile := strings.NewReader(fmt.Sprintf("FROM %s\nPARAMETER seed 42\nPARAMETER top_p 0.9\nPARAMETER stop foo\nPARAMETER stop bar", fname))
|
||||
modelfile := strings.NewReader(fmt.Sprintf("FROM %s", fname))
|
||||
commands, err := parser.Parse(modelfile)
|
||||
assert.Nil(t, err)
|
||||
fn := func(resp api.ProgressResponse) {
|
||||
@@ -168,42 +167,6 @@ func Test_Routes(t *testing.T) {
|
||||
assert.Equal(t, "beefsteak:latest", model.ShortName)
|
||||
},
|
||||
},
|
||||
{
|
||||
Name: "Show Model Handler",
|
||||
Method: http.MethodPost,
|
||||
Path: "/api/show",
|
||||
Setup: func(t *testing.T, req *http.Request) {
|
||||
createTestModel(t, "show-model")
|
||||
showReq := api.ShowRequest{Model: "show-model"}
|
||||
jsonData, err := json.Marshal(showReq)
|
||||
assert.Nil(t, err)
|
||||
req.Body = io.NopCloser(bytes.NewReader(jsonData))
|
||||
},
|
||||
Expected: func(t *testing.T, resp *http.Response) {
|
||||
contentType := resp.Header.Get("Content-Type")
|
||||
assert.Equal(t, contentType, "application/json; charset=utf-8")
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
assert.Nil(t, err)
|
||||
|
||||
var showResp api.ShowResponse
|
||||
err = json.Unmarshal(body, &showResp)
|
||||
assert.Nil(t, err)
|
||||
|
||||
var params []string
|
||||
paramsSplit := strings.Split(showResp.Parameters, "\n")
|
||||
for _, p := range paramsSplit {
|
||||
params = append(params, strings.Join(strings.Fields(p), " "))
|
||||
}
|
||||
sort.Strings(params)
|
||||
expectedParams := []string{
|
||||
"seed 42",
|
||||
"stop \"bar\"",
|
||||
"stop \"foo\"",
|
||||
"top_p 0.9",
|
||||
}
|
||||
assert.Equal(t, expectedParams, params)
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
s, err := setupServer(t)
|
||||
@@ -230,12 +193,13 @@ func Test_Routes(t *testing.T) {
|
||||
}
|
||||
|
||||
resp, err := httpSrv.Client().Do(req)
|
||||
assert.Nil(t, err)
|
||||
defer resp.Body.Close()
|
||||
assert.Nil(t, err)
|
||||
|
||||
if tc.Expected != nil {
|
||||
tc.Expected(t, resp)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -7,7 +7,7 @@ import (
|
||||
"fmt"
|
||||
"hash"
|
||||
"io"
|
||||
"log/slog"
|
||||
"log"
|
||||
"math"
|
||||
"net/http"
|
||||
"net/url"
|
||||
@@ -88,7 +88,7 @@ func (b *blobUpload) Prepare(ctx context.Context, requestURL *url.URL, opts *Reg
|
||||
return nil
|
||||
}
|
||||
|
||||
size := b.Total / numUploadParts
|
||||
var size = b.Total / numUploadParts
|
||||
switch {
|
||||
case size < minUploadPartSize:
|
||||
size = minUploadPartSize
|
||||
@@ -107,7 +107,7 @@ func (b *blobUpload) Prepare(ctx context.Context, requestURL *url.URL, opts *Reg
|
||||
offset += size
|
||||
}
|
||||
|
||||
slog.Info(fmt.Sprintf("uploading %s in %d %s part(s)", b.Digest[7:19], len(b.Parts), format.HumanBytes(b.Parts[0].Size)))
|
||||
log.Printf("uploading %s in %d %s part(s)", b.Digest[7:19], len(b.Parts), format.HumanBytes(b.Parts[0].Size))
|
||||
|
||||
requestURL, err = url.Parse(location)
|
||||
if err != nil {
|
||||
@@ -156,7 +156,7 @@ func (b *blobUpload) Run(ctx context.Context, opts *RegistryOptions) {
|
||||
return err
|
||||
case err != nil:
|
||||
sleep := time.Second * time.Duration(math.Pow(2, float64(try)))
|
||||
slog.Info(fmt.Sprintf("%s part %d attempt %d failed: %v, retrying in %s", b.Digest[7:19], part.N, try, err, sleep))
|
||||
log.Printf("%s part %d attempt %d failed: %v, retrying in %s", b.Digest[7:19], part.N, try, err, sleep)
|
||||
time.Sleep(sleep)
|
||||
continue
|
||||
}
|
||||
@@ -200,7 +200,7 @@ func (b *blobUpload) Run(ctx context.Context, opts *RegistryOptions) {
|
||||
break
|
||||
} else if err != nil {
|
||||
sleep := time.Second * time.Duration(math.Pow(2, float64(try)))
|
||||
slog.Info(fmt.Sprintf("%s complete upload attempt %d failed: %v, retrying in %s", b.Digest[7:19], try, err, sleep))
|
||||
log.Printf("%s complete upload attempt %d failed: %v, retrying in %s", b.Digest[7:19], try, err, sleep)
|
||||
time.Sleep(sleep)
|
||||
continue
|
||||
}
|
||||
@@ -265,7 +265,7 @@ func (b *blobUpload) uploadPart(ctx context.Context, method string, requestURL *
|
||||
return err
|
||||
case err != nil:
|
||||
sleep := time.Second * time.Duration(math.Pow(2, float64(try)))
|
||||
slog.Info(fmt.Sprintf("%s part %d attempt %d failed: %v, retrying in %s", b.Digest[7:19], part.N, try, err, sleep))
|
||||
log.Printf("%s part %d attempt %d failed: %v, retrying in %s", b.Digest[7:19], part.N, try, err, sleep)
|
||||
time.Sleep(sleep)
|
||||
continue
|
||||
}
|
||||
@@ -395,7 +395,6 @@ func uploadBlob(ctx context.Context, mp ModelPath, layer *Layer, opts *RegistryO
|
||||
return err
|
||||
}
|
||||
|
||||
// nolint: contextcheck
|
||||
go upload.Run(context.Background(), opts)
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user