Compare commits

...

35 Commits

Author SHA1 Message Date
jmorganca
0fd4199b39 update llama.cpp submodule to 8960fe8 2024-04-22 08:45:08 -04:00
Cheng
62be2050dd chore: use errors.New to replace fmt.Errorf will much better (#3789) 2024-04-20 22:11:06 -04:00
Blake Mizerany
56f8aa6912 types/model: export IsValidNamePart (#3788) 2024-04-20 18:26:34 -07:00
Sri Siddhaarth
e6f9bfc0e8 Update api.md (#3705) 2024-04-20 15:17:03 -04:00
Daniel Hiltgen
8d1995c625 Merge pull request #3708 from remy415/arm64static
move Ollama static build to its own flag
2024-04-18 16:04:12 -07:00
Daniel Hiltgen
fd01fbf038 Merge pull request #3710 from remy415/update-jetson-docs
update jetson tutorial
2024-04-18 16:02:08 -07:00
Blake Mizerany
0408205c1c types/model: accept former : as a separator in digest (#3724)
This also converges the old sep `:` to the new sep `-`.
2024-04-18 14:17:46 -07:00
Jeffrey Morgan
63a7edd771 Update README.md 2024-04-18 16:09:38 -04:00
Michael
554ffdcce3 add llama3 to readme
add llama3 to readme
2024-04-18 15:18:48 -04:00
Jeremy
9850a4ce08 Merge branch 'ollama:main' into update-jetson-docs 2024-04-18 09:55:17 -04:00
Jeremy
fd048f1367 Merge branch 'ollama:main' into arm64static 2024-04-18 09:55:04 -04:00
Michael Yang
8645076a71 Merge pull request #3712 from ollama/mxyng/mem
add stablelm graph calculation
2024-04-17 15:57:51 -07:00
Michael Yang
05e9424824 Merge pull request #3664 from ollama/mxyng/fix-padding-2
fix padding to only return padding
2024-04-17 15:57:40 -07:00
Michael Yang
52ebe67a98 Merge pull request #3714 from ollama/mxyng/model-name-host
types/model: support : in PartHost for host:port
2024-04-17 15:34:03 -07:00
Michael Yang
889b31ab78 types/model: support : in PartHost for host:port 2024-04-17 15:16:07 -07:00
Michael Yang
3cf483fe48 add stablelm graph calculation 2024-04-17 13:57:19 -07:00
Jeremy
8dca03173d Merge remote-tracking branch 'upstream/main' into update-jetson-docs 2024-04-17 16:18:50 -04:00
Jeremy
85bdf14b56 update jetson tutorial 2024-04-17 16:17:42 -04:00
Jeremy
da8a0c7657 Merge branch 'ollama:main' into arm64static 2024-04-17 15:22:34 -04:00
jmorganca
c8afe7168c use correct extension for feature and model request issue templates 2024-04-17 15:18:40 -04:00
jmorganca
28d3cd0148 simpler feature and model request forms 2024-04-17 15:17:08 -04:00
jmorganca
eb5554232a simpler feature and model request forms 2024-04-17 15:14:49 -04:00
Jeremy
ea4c284a48 Merge branch 'ollama:main' into arm64static 2024-04-17 15:11:38 -04:00
jmorganca
2bdc320216 add descriptions to issue templates 2024-04-17 15:08:36 -04:00
jmorganca
32561aed09 simplify github issue templates a bit 2024-04-17 15:07:03 -04:00
Michael Yang
71548d9829 Merge pull request #3706 from ollama/mxyng/mem
account for all non-repeating layers
2024-04-17 11:58:20 -07:00
Jeremy
8aec92fa6d rearranged conditional logic for static build, dockerfile updated 2024-04-17 14:43:28 -04:00
Michael Yang
a8b9b930b4 account for all non-repeating layers 2024-04-17 11:21:21 -07:00
Michael
9755cf9173 acknowledge the amazing work done by Georgi and team! 2024-04-17 13:48:14 -04:00
Jeremy
70261b9bb6 move static build to its own flag 2024-04-17 13:04:28 -04:00
Blake Mizerany
9df6c85c3a types/model: add FilepathNoBuild (#3680)
Also, add test for DisplayLongest.

Also, plumb fill param to ParseName in MustParseName
2024-04-16 18:35:43 -07:00
Michael Yang
e74163af4c fix padding to only return padding 2024-04-16 15:43:26 -07:00
Michael Yang
fb9580df85 Merge pull request #3684 from ollama/mxyng/scale-graph
scale graph based on gpu count
2024-04-16 14:57:09 -07:00
Michael Yang
26df674785 scale graph based on gpu count 2024-04-16 14:44:13 -07:00
Jeffrey Morgan
7c9792a6e0 Support unicode characters in model path (#3681)
* parse wide argv characters on windows

* cleanup

* move cleanup to end of `main`
2024-04-16 17:00:12 -04:00
23 changed files with 292 additions and 297 deletions

View File

@@ -0,0 +1,60 @@
name: Bug report
labels: [bug]
description: Something isn't working right.
body:
- type: textarea
id: description
attributes:
label: What is the issue?
description: What happened? What did you expect to happen?
validations:
required: true
- type: dropdown
id: os
attributes:
label: OS
description: Which operating system are you using?
multiple: true
options:
- Linux
- macOS
- Windows
- Docker
- WSL2
validations:
required: false
- type: dropdown
id: gpu
attributes:
label: GPU
description: Which GPU are you using?
multiple: true
options:
- Nvidia
- AMD
- Intel
- Apple
- Other
validations:
required: false
- type: dropdown
id: cpu
attributes:
label: CPU
description: Which CPU are you using?
multiple: true
options:
- Intel
- AMD
- Apple
- Other
validations:
required: false
- type: input
id: version
attributes:
label: Ollama version
description: What version of Ollama are you using? (`ollama --version`)
placeholder: e.g., 0.1.32
validations:
required: false

View File

@@ -1,18 +0,0 @@
name: Model request
description: Request a new model for the library
labels: [mr]
body:
- type: markdown
attributes:
value: |
Please check if your Model request is [already available](https://ollama.com/search) or that you cannot [import it](https://github.com/ollama/ollama/blob/main/docs/import.md#import-a-model) yourself.
Tell us about which Model you'd like to see in the library!
- type: textarea
id: problem
attributes:
label: What model would you like?
description: Please provide a link to the model.
- type: markdown
attributes:
value: |
Thanks for filing a model request!

View File

@@ -0,0 +1,6 @@
---
name: Feature request
about: Request a new feature
labels: feature request
---

View File

@@ -1,41 +0,0 @@
name: Feature request
description: Propose a new feature
labels: [needs-triage, fr]
body:
- type: markdown
attributes:
value: |
Please check if your feature request is [already filed](https://github.com/ollama/ollama/issues).
Tell us about your idea!
- type: textarea
id: problem
attributes:
label: What are you trying to do?
description: Tell us about the problem you're trying to solve.
validations:
required: false
- type: textarea
id: solution
attributes:
label: How should we solve this?
description: If you have an idea of how you'd like to see this feature work, let us know.
validations:
required: false
- type: textarea
id: alternative
attributes:
label: What is the impact of not solving this?
description: (How) Are you currently working around the issue?
validations:
required: false
- type: textarea
id: context
attributes:
label: Anything else?
description: Any additional context to share, e.g., links
validations:
required: false
- type: markdown
attributes:
value: |
Thanks for filing a feature request!

View File

@@ -0,0 +1,5 @@
---
name: Model request
about: Request support for a new model to be added to Ollama
labels: model request
---

View File

@@ -1,125 +0,0 @@
name: Bug report
description: File a bug report. If you need help, please join our Discord server.
labels: [needs-triage, bug]
body:
- type: markdown
attributes:
value: |
Please check if your bug is [already filed](https://github.com/ollama/ollama/issues) before filing a new one.
- type: textarea
id: what-happened
attributes:
label: What is the issue?
description: What happened? What did you expect to happen?
validations:
required: true
- type: textarea
id: what-was-expected
attributes:
label: What did you expect to see?
description: What did you expect to see/happen instead?
validations:
required: false
- type: textarea
id: steps
attributes:
label: Steps to reproduce
description: What are the steps you took that hit this issue?
validations:
required: false
- type: textarea
id: changes
attributes:
label: Are there any recent changes that introduced the issue?
description: If so, what are those changes?
validations:
required: false
- type: dropdown
id: os
attributes:
label: OS
description: What OS are you using? You may select more than one.
multiple: true
options:
- Linux
- macOS
- Windows
- Other
validations:
required: false
- type: dropdown
id: architecture
attributes:
label: Architecture
description: What architecture are you using? You may select more than one.
multiple: true
options:
- arm64
- amd64
- x86
- Other
- type: dropdown
id: platform
attributes:
label: Platform
description: What platform are you using? You may select more than one.
multiple: true
options:
- Docker
- WSL
- WSL2
validations:
required: false
- type: input
id: ollama-version
attributes:
label: Ollama version
description: What Ollama version are you using? (`ollama --version`)
placeholder: e.g., 1.14.4
validations:
required: false
- type: dropdown
id: gpu
attributes:
label: GPU
description: What GPU, if any, are you using? You may select more than one.
multiple: true
options:
- Nvidia
- AMD
- Intel
- Apple
- Other
validations:
required: false
- type: textarea
id: gpu-info
attributes:
label: GPU info
description: What GPU info do you have? (`nvidia-smi`, `rocminfo`, `system_profiler SPDisplaysDataType`, etc.)
validations:
required: false
- type: dropdown
id: cpu
attributes:
label: CPU
description: What CPU are you using? You may select more than one.
multiple: true
options:
- Intel
- AMD
- Apple
- Other
validations:
required: false
- type: textarea
id: other-software
attributes:
label: Other software
description: What other software are you using that might be related to this issue?
validations:
required: false
- type: markdown
attributes:
value: |
Thanks for filing a bug report!

View File

@@ -18,7 +18,7 @@ ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
COPY --from=llm-code / /go/src/github.com/ollama/ollama/
WORKDIR /go/src/github.com/ollama/ollama/llm/generate
ARG CGO_CFLAGS
RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION-devel-rockylinux8 AS cuda-build-arm64
ARG CMAKE_VERSION
@@ -28,7 +28,7 @@ ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH
COPY --from=llm-code / /go/src/github.com/ollama/ollama/
WORKDIR /go/src/github.com/ollama/ollama/llm/generate
ARG CGO_CFLAGS
RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS rocm-build-amd64
ARG CMAKE_VERSION
@@ -40,7 +40,7 @@ COPY --from=llm-code / /go/src/github.com/ollama/ollama/
WORKDIR /go/src/github.com/ollama/ollama/llm/generate
ARG CGO_CFLAGS
ARG AMDGPU_TARGETS
RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
RUN mkdir /tmp/scratch && \
for dep in $(zcat /go/src/github.com/ollama/ollama/llm/build/linux/x86_64/rocm*/bin/deps.txt.gz) ; do \
cp ${dep} /tmp/scratch/ || exit 1 ; \
@@ -64,11 +64,11 @@ WORKDIR /go/src/github.com/ollama/ollama/llm/generate
FROM --platform=linux/amd64 cpu-builder-amd64 AS static-build-amd64
RUN OLLAMA_CPU_TARGET="static" sh gen_linux.sh
FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu-build-amd64
RUN OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh
RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh
FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx-build-amd64
RUN OLLAMA_CPU_TARGET="cpu_avx" sh gen_linux.sh
RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx" sh gen_linux.sh
FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx2-build-amd64
RUN OLLAMA_CPU_TARGET="cpu_avx2" sh gen_linux.sh
RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx2" sh gen_linux.sh
FROM --platform=linux/arm64 centos:7 AS cpu-builder-arm64
ARG CMAKE_VERSION
@@ -84,7 +84,7 @@ WORKDIR /go/src/github.com/ollama/ollama/llm/generate
FROM --platform=linux/arm64 cpu-builder-arm64 AS static-build-arm64
RUN OLLAMA_CPU_TARGET="static" sh gen_linux.sh
FROM --platform=linux/arm64 cpu-builder-arm64 AS cpu-build-arm64
RUN OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh
RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh
# Intermediate stage used for ./scripts/build_linux.sh

View File

@@ -35,10 +35,10 @@ The official [Ollama Docker image](https://hub.docker.com/r/ollama/ollama) `olla
## Quickstart
To run and chat with [Llama 2](https://ollama.com/library/llama2):
To run and chat with [Llama 3](https://ollama.com/library/llama3):
```
ollama run llama2
ollama run llama3
```
## Model library
@@ -49,7 +49,8 @@ Here are some example models that can be downloaded:
| Model | Parameters | Size | Download |
| ------------------ | ---------- | ----- | ------------------------------ |
| Llama 2 | 7B | 3.8GB | `ollama run llama2` |
| Llama 3 | 8B | 4.7GB | `ollama run llama3` |
| Llama 3 | 70B | 40GB | `ollama run llama3:70b` |
| Mistral | 7B | 4.1GB | `ollama run mistral` |
| Dolphin Phi | 2.7B | 1.6GB | `ollama run dolphin-phi` |
| Phi-2 | 2.7B | 1.7GB | `ollama run phi` |
@@ -60,7 +61,6 @@ Here are some example models that can be downloaded:
| Llama 2 13B | 13B | 7.3GB | `ollama run llama2:13b` |
| Llama 2 70B | 70B | 39GB | `ollama run llama2:70b` |
| Orca Mini | 3B | 1.9GB | `ollama run orca-mini` |
| Vicuna | 7B | 3.8GB | `ollama run vicuna` |
| LLaVA | 7B | 4.5GB | `ollama run llava` |
| Gemma | 2B | 1.4GB | `ollama run gemma:2b` |
| Gemma | 7B | 4.8GB | `ollama run gemma:7b` |
@@ -98,16 +98,16 @@ See the [guide](docs/import.md) on importing models for more information.
### Customize a prompt
Models from the Ollama library can be customized with a prompt. For example, to customize the `llama2` model:
Models from the Ollama library can be customized with a prompt. For example, to customize the `llama3` model:
```
ollama pull llama2
ollama pull llama3
```
Create a `Modelfile`:
```
FROM llama2
FROM llama3
# set the temperature to 1 [higher is more creative, lower is more coherent]
PARAMETER temperature 1
@@ -142,7 +142,7 @@ ollama create mymodel -f ./Modelfile
### Pull a model
```
ollama pull llama2
ollama pull llama3
```
> This command can also be used to update a local model. Only the diff will be pulled.
@@ -150,13 +150,13 @@ ollama pull llama2
### Remove a model
```
ollama rm llama2
ollama rm llama3
```
### Copy a model
```
ollama cp llama2 my-llama2
ollama cp llama3 my-model
```
### Multiline input
@@ -180,7 +180,7 @@ The image features a yellow smiley face, which is likely the central focus of th
### Pass in prompt as arguments
```
$ ollama run llama2 "Summarize this file: $(cat README.md)"
$ ollama run llama3 "Summarize this file: $(cat README.md)"
Ollama is a lightweight, extensible framework for building and running language models on the local machine. It provides a simple API for creating, running, and managing models, as well as a library of pre-built models that can be easily used in a variety of applications.
```
@@ -227,7 +227,7 @@ Next, start the server:
Finally, in a separate shell, run a model:
```
./ollama run llama2
./ollama run llama3
```
## REST API
@@ -238,7 +238,7 @@ Ollama has a REST API for running and managing models.
```
curl http://localhost:11434/api/generate -d '{
"model": "llama2",
"model": "llama3",
"prompt":"Why is the sky blue?"
}'
```
@@ -247,7 +247,7 @@ curl http://localhost:11434/api/generate -d '{
```
curl http://localhost:11434/api/chat -d '{
"model": "mistral",
"model": "llama3",
"messages": [
{ "role": "user", "content": "why is the sky blue?" }
]
@@ -378,3 +378,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Page Assist](https://github.com/n4ze3m/page-assist) (Chrome Extension)
- [AI Telegram Bot](https://github.com/tusharhero/aitelegrambot) (Telegram bot using Ollama in backend)
- [AI ST Completion](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (Sublime Text 4 AI assistant plugin with Ollama support)
### Supported backends
- [llama.cpp](https://github.com/ggerganov/llama.cpp) project founded by Georgi Gerganov.

View File

@@ -2,6 +2,7 @@ package api
import (
"encoding/json"
"errors"
"fmt"
"math"
"os"
@@ -307,7 +308,7 @@ func (m *Metrics) Summary() {
}
}
var ErrInvalidOpts = fmt.Errorf("invalid options")
var ErrInvalidOpts = errors.New("invalid options")
func (opts *Options) FromMap(m map[string]interface{}) error {
valueOpts := reflect.ValueOf(opts).Elem() // names of the fields in the options struct

View File

@@ -90,7 +90,7 @@ The final response in the stream also includes additional data about the generat
- `load_duration`: time spent in nanoseconds loading the model
- `prompt_eval_count`: number of tokens in the prompt
- `prompt_eval_duration`: time spent in nanoseconds evaluating the prompt
- `eval_count`: number of tokens the response
- `eval_count`: number of tokens in the response
- `eval_duration`: time in nanoseconds spent generating the response
- `context`: an encoding of the conversation used in this response, this can be sent in the next request to keep a conversational memory
- `response`: empty if the response was streamed, if not streamed, this will contain the full response
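
All of these duration fields are reported in nanoseconds, so generation throughput can be derived directly from the final streamed object. A small sketch of that calculation (the sample values are made up; the field names follow the documented response shape):

```go
package main

import "fmt"

func main() {
	// Hypothetical values from the final response object of /api/generate.
	evalCount := 290                     // tokens in the response
	evalDuration := int64(4_709_213_000) // nanoseconds spent generating

	tokensPerSecond := float64(evalCount) / float64(evalDuration) * 1e9
	fmt.Printf("%.1f tokens/s\n", tokensPerSecond) // ~61.6 tokens/s
}
```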

View File

@@ -1,38 +1,15 @@
# Running Ollama on NVIDIA Jetson Devices
With some minor configuration, Ollama runs well on [NVIDIA Jetson Devices](https://www.nvidia.com/en-us/autonomous-machines/embedded-systems/). The following has been tested on [JetPack 5.1.2](https://developer.nvidia.com/embedded/jetpack).
Ollama runs well on [NVIDIA Jetson Devices](https://www.nvidia.com/en-us/autonomous-machines/embedded-systems/) and should run out of the box with the standard installation instructions.
NVIDIA Jetson devices are Linux-based embedded AI computers that are purpose-built for AI applications.
Jetsons have an integrated GPU that is wired directly to the memory controller of the machine. For this reason, the `nvidia-smi` command is unrecognized, and Ollama proceeds to operate in "CPU only" mode. This can be verified by using a monitoring tool like jtop.
In order to address this, we simply pass the path to the Jetson's pre-installed CUDA libraries into `ollama serve` (while in a tmux session). We then hardcode the num_gpu parameters into a cloned version of our target model.
Prerequisites:
- curl
- tmux
Here are the steps:
The following has been tested on [JetPack 5.1.2](https://developer.nvidia.com/embedded/jetpack), but should also work on JetPack 6.0.
- Install Ollama via standard Linux command (ignore the 404 error): `curl https://ollama.com/install.sh | sh`
- Stop the Ollama service: `sudo systemctl stop ollama`
- Start Ollama serve in a tmux session called ollama_jetson and reference the CUDA libraries path: `tmux has-session -t ollama_jetson 2>/dev/null || tmux new-session -d -s ollama_jetson 'LD_LIBRARY_PATH=/usr/local/cuda/lib64 ollama serve'`
- Pull the model you want to use (e.g. mistral): `ollama pull mistral`
- Create a new Modelfile specifically for enabling GPU support on the Jetson: `touch ModelfileMistralJetson`
- In the ModelfileMistralJetson file, specify the FROM model and the num_gpu PARAMETER as shown below:
```
FROM mistral
PARAMETER num_gpu 999
```
- Create a new model from your Modelfile: `ollama create mistral-jetson -f ./ModelfileMistralJetson`
- Run the new model: `ollama run mistral-jetson`
If you run a monitoring tool like jtop you should now see that Ollama is using the Jetson's integrated GPU.
- Start an interactive session: `ollama run mistral`
And that's it!
# Running Ollama in Docker
When running GPU accelerated applications in Docker, it is highly recommended to use [dusty-nv jetson-containers repo](https://github.com/dusty-nv/jetson-containers).

View File

@@ -55,6 +55,6 @@ func getCPUMem() (memInfo, error) {
return memInfo{
TotalMemory: uint64(C.getPhysicalMemory()),
FreeMemory: 0,
DeviceCount: 0,
DeviceCount: 1,
}, nil
}

View File

@@ -39,6 +39,10 @@
#include "httplib.h"
#include "json.hpp"
#if defined(_WIN32)
#include <windows.h>
#endif
#include <cstddef>
#include <thread>
#include <chrono>
@@ -2770,8 +2774,28 @@ inline void signal_handler(int signal) {
shutdown_handler(signal);
}
int main(int argc, char **argv)
{
#if defined(_WIN32)
char* wchar_to_char(const wchar_t* wstr) {
if (wstr == nullptr) return nullptr;
// Determine the number of bytes needed for the UTF-8 string
int bytes = WideCharToMultiByte(CP_UTF8, 0, wstr, -1, nullptr, 0, nullptr, nullptr);
char* str = new char[bytes];
// Convert the wide-character string to a UTF-8 string
WideCharToMultiByte(CP_UTF8, 0, wstr, -1, str, bytes, nullptr, nullptr);
return str;
}
int wmain(int argc, wchar_t **wargv) {
char** argv = new char*[argc];
for (int i = 0; i < argc; ++i) {
argv[i] = wchar_to_char(wargv[i]);
}
#else
int main(int argc, char **argv) {
#endif
#if SERVER_VERBOSE != 1
log_disable();
#endif
@@ -3282,6 +3306,11 @@ int main(int argc, char **argv)
return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
};
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
for (int i = 0; i < argc; ++i) {
delete[] argv[i];
}
delete[] argv;
#endif
llama.queue_tasks.start_loop();
svr.stop();

View File

@@ -57,21 +57,21 @@ init_vars
git_module_setup
apply_patches
init_vars
if [ -z "${OLLAMA_SKIP_STATIC_GENERATE}" -o "${OLLAMA_CPU_TARGET}" = "static" ]; then
# Builds by default, allows skipping, forces build if OLLAMA_CPU_TARGET="static"
# Enables optimized Dockerfile builds using a blanket skip and targeted overrides
# Static build for linking into the Go binary
init_vars
CMAKE_TARGETS="--target llama --target ggml"
CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="../build/linux/${ARCH}_static"
echo "Building static library"
build
fi
init_vars
if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "static" ]; then
# Static build for linking into the Go binary
init_vars
CMAKE_TARGETS="--target llama --target ggml"
CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="../build/linux/${ARCH}_static"
echo "Building static library"
build
fi
# Users building from source can tune the exact flags we pass to cmake for configuring
# llama.cpp, and we'll build only 1 CPU variant in that case as the default.
if [ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]; then

View File

@@ -164,7 +164,8 @@ func (ts Tensors) Layers() map[string]Layer {
for _, t := range ts {
parts := strings.Split(t.Name, ".")
if parts[0] == "blk" {
parts = parts[1:]
// join first and second part, e.g. blk.%d
parts = append([]string{fmt.Sprintf("%s.%s", parts[0], parts[1])}, parts[2:]...)
}
if _, ok := layers[parts[0]]; !ok {
@@ -380,6 +381,12 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
)
partialOffload = 4*batch*(2*embedding+vocab) + embedding*vocab*105/128
case "stablelm":
fullOffload = 4 * batch * (context*(1+heads) + 3*embedding + 2)
partialOffload = max(
4*batch*(vocab+2*embedding),
fullOffload,
)
}
return
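
The `Layers()` change above groups tensors under `blk.<n>` keys instead of the bare block index, which is why the server code below now looks up `layers[fmt.Sprintf("blk.%d", i)]`. A standalone sketch of the grouping idea (the tensor names are illustrative; the real method operates on the GGML tensor list):

```go
package main

import (
	"fmt"
	"strings"
)

// groupByLayer buckets tensor names under "blk.<n>" keys, leaving
// non-repeating tensors (e.g. "output.weight") under their first part.
func groupByLayer(names []string) map[string][]string {
	layers := make(map[string][]string)
	for _, name := range names {
		parts := strings.Split(name, ".")
		key := parts[0]
		if parts[0] == "blk" && len(parts) > 1 {
			key = parts[0] + "." + parts[1] // e.g. "blk.0"
		}
		layers[key] = append(layers[key], name)
	}
	return layers
}

func main() {
	names := []string{"blk.0.attn_q.weight", "blk.0.ffn_up.weight", "blk.1.attn_q.weight", "output.weight"}
	for key, ts := range groupByLayer(names) {
		fmt.Println(key, ts)
	}
}
```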

View File

@@ -248,13 +248,17 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error {
}
padding := llm.padding(offset, int64(alignment))
if _, err := rs.Seek(padding-offset, io.SeekCurrent); err != nil {
if _, err := rs.Seek(padding, io.SeekCurrent); err != nil {
return err
}
for _, tensor := range llm.tensors {
padded := (int64(tensor.size()) + int64(alignment) - 1) & ^(int64(alignment) - 1)
if _, err := rs.Seek(padded, io.SeekCurrent); err != nil {
if _, err := rs.Seek(int64(tensor.size()), io.SeekCurrent); err != nil {
return err
}
padding := llm.padding(int64(tensor.size()), int64(alignment))
if _, err := rs.Seek(padding, io.SeekCurrent); err != nil {
return err
}
}
@@ -623,8 +627,9 @@ func (llm *gguf) Encode(ws io.WriteSeeker, kv KV, tensors []Tensor) error {
return err
}
padding := llm.padding(offset, 32)
if err := binary.Write(ws, llm.ByteOrder, bytes.Repeat([]byte{0}, int(padding-offset))); err != nil {
var alignment int64 = 32
padding := llm.padding(offset, alignment)
if err := binary.Write(ws, llm.ByteOrder, bytes.Repeat([]byte{0}, int(padding))); err != nil {
return err
}
@@ -638,8 +643,8 @@ func (llm *gguf) Encode(ws io.WriteSeeker, kv KV, tensors []Tensor) error {
return err
}
padding := llm.padding(offset, 32)
if err := binary.Write(ws, llm.ByteOrder, bytes.Repeat([]byte{0}, int(padding-offset))); err != nil {
padding := llm.padding(offset, alignment)
if err := binary.Write(ws, llm.ByteOrder, bytes.Repeat([]byte{0}, int(padding))); err != nil {
return err
}
}
@@ -648,5 +653,5 @@ func (llm *gguf) Encode(ws io.WriteSeeker, kv KV, tensors []Tensor) error {
}
func (gguf) padding(offset, align int64) int64 {
return (offset + align - 1) / align * align
return (align - offset%align) % align
}
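
Note the change in what `padding` returns: the old expression `(offset + align - 1) / align * align` yielded the next aligned offset, while the new `(align - offset%align) % align` yields the number of padding bytes, which is why the callers above seek by `padding` directly instead of `padding-offset`. A quick check of the two formulas (values are arbitrary):

```go
package main

import "fmt"

func alignedOffset(offset, align int64) int64 { return (offset + align - 1) / align * align }
func paddingBytes(offset, align int64) int64  { return (align - offset%align) % align }

func main() {
	for _, offset := range []int64{0, 1, 100, 128} {
		fmt.Printf("offset=%d aligned=%d padding=%d\n",
			offset, alignedOffset(offset, 32), paddingBytes(offset, 32))
	}
	// offset=0   aligned=0   padding=0
	// offset=1   aligned=32  padding=31
	// offset=100 aligned=128 padding=28
	// offset=128 aligned=128 padding=0
}
```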

View File

@@ -17,7 +17,7 @@ import (
"github.com/ollama/ollama/gpu"
)
var errPayloadMissing = fmt.Errorf("expected payloads not included in this build of ollama")
var errPayloadMissing = errors.New("expected payloads not included in this build of ollama")
func Init() error {
payloadsDir, err := gpu.PayloadsDir()

View File

@@ -79,6 +79,9 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
graphFullOffload = graphPartialOffload
}
graphFullOffload *= uint64(info.DeviceCount)
graphPartialOffload *= uint64(info.DeviceCount)
// memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
memoryRequiredTotal := memoryMinimum + graphFullOffload
@@ -94,7 +97,7 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
var layerCount int
layers := ggml.Tensors().Layers()
for i := 0; i < int(ggml.KV().BlockCount()); i++ {
memoryLayer := layers[fmt.Sprintf("%d", i)].size()
memoryLayer := layers[fmt.Sprintf("blk.%d", i)].size()
// KV is proportional to the number of layers
memoryLayer += kv / ggml.KV().BlockCount()
@@ -106,7 +109,13 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
}
}
memoryLayerOutput := layers["output"].size()
var memoryLayerOutput uint64
for k, v := range layers {
if !strings.HasPrefix(k, "blk.") {
memoryLayerOutput += v.size()
}
}
memoryRequiredTotal += memoryLayerOutput
if info.Library == "metal" && memoryRequiredTotal > info.TotalMemory {
@@ -121,16 +130,47 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
opts.NumGPU = layerCount
}
memoryWeights := memoryRequiredTotal - memoryMinimum - graphFullOffload - kv
slog.Info(
"offload to gpu",
"reallayers", opts.NumGPU,
"layers", layerCount,
"required", format.HumanBytes2(memoryRequiredTotal),
"used", format.HumanBytes2(memoryRequiredPartial),
"available", format.HumanBytes2(memoryAvailable),
"kv", format.HumanBytes2(kv),
"fulloffload", format.HumanBytes2(graphFullOffload),
"partialoffload", format.HumanBytes2(graphPartialOffload),
slog.Group(
"layers",
// actual number of layers offloaded
"real", opts.NumGPU,
// estimated number of layers that can be offloaded
"estimate", layerCount,
),
slog.Group(
"memory",
// memory available for offloading
"available", format.HumanBytes2(memoryAvailable),
slog.Group(
"required",
// memory required for full offloading
"full", format.HumanBytes2(memoryRequiredTotal),
// memory required to offload layers.estimate layers
"partial", format.HumanBytes2(memoryRequiredPartial),
// memory of KV cache
"kv", format.HumanBytes2(kv),
),
slog.Group(
"weights",
// memory of the weights
"total", format.HumanBytes2(memoryWeights),
// memory of repeating layers
"repeating", format.HumanBytes2(memoryWeights-memoryLayerOutput),
// memory of non-repeating layers
"nonrepeating", format.HumanBytes2(memoryLayerOutput),
),
slog.Group(
"graph",
// memory of graph when fully offloaded
"full", format.HumanBytes2(graphFullOffload),
// memory of graph when not fully offloaded
"partial", format.HumanBytes2(graphPartialOffload),
),
),
)
if len(adapters) > 1 {

View File

@@ -1137,7 +1137,7 @@ func GetSHA256Digest(r io.Reader) (string, int64) {
return fmt.Sprintf("sha256:%x", h.Sum(nil)), n
}
var errUnauthorized = fmt.Errorf("unauthorized")
var errUnauthorized = errors.New("unauthorized")
func makeRequestWithRetry(ctx context.Context, method string, requestURL *url.URL, headers http.Header, body io.ReadSeeker, regOpts *registryOptions) (*http.Response, error) {
for i := 0; i < 2; i++ {
@@ -1255,7 +1255,7 @@ func parseRegistryChallenge(authStr string) registryChallenge {
}
}
var errDigestMismatch = fmt.Errorf("digest mismatch, file must be downloaded again")
var errDigestMismatch = errors.New("digest mismatch, file must be downloaded again")
func verifyBlob(digest string) error {
fp, err := GetBlobsPath(digest)

View File

@@ -1,6 +1,7 @@
package model
import (
"fmt"
"log/slog"
"strings"
"unicode"
@@ -47,8 +48,11 @@ var (
// Digest.
func ParseDigest(s string) Digest {
typ, digest, ok := strings.Cut(s, "-")
if !ok {
typ, digest, ok = strings.Cut(s, ":")
}
if ok && isValidDigestType(typ) && isValidHex(digest) {
return Digest{s: s}
return Digest{s: fmt.Sprintf("%s-%s", typ, digest)}
}
return Digest{}
}
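
With the change above, the legacy `:` separator in a digest is accepted on parse but the stored string is normalized to the new `-` form, matching the commit message "accept former : as a separator in digest". A minimal sketch of just the conversion rule (standalone; it omits the `isValidDigestType` and `isValidHex` checks the real `ParseDigest` performs):

```go
package main

import (
	"fmt"
	"strings"
)

// normalizeDigest converts "sha256:<hex>" to "sha256-<hex>" and leaves
// already-normalized digests untouched.
func normalizeDigest(s string) string {
	typ, digest, ok := strings.Cut(s, "-")
	if !ok {
		typ, digest, ok = strings.Cut(s, ":")
	}
	if !ok {
		return ""
	}
	return fmt.Sprintf("%s-%s", typ, digest)
}

func main() {
	fmt.Println(normalizeDigest("sha256:abc123")) // sha256-abc123
	fmt.Println(normalizeDigest("sha256-abc123")) // sha256-abc123
}
```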

View File

@@ -156,7 +156,7 @@ func ParseName(s, fill string) Name {
r = Name{}
return false
}
if kind == PartExtraneous || !isValidPart(kind, part) {
if kind == PartExtraneous || !IsValidNamePart(kind, part) {
r = Name{}
return false
}
@@ -176,7 +176,7 @@ func parseMask(s string) Name {
// mask part; treat as empty but valid
return true
}
if !isValidPart(kind, part) {
if !IsValidNamePart(kind, part) {
panic(fmt.Errorf("invalid mask part %s: %q", kind, part))
}
r.parts[kind] = part
@@ -185,8 +185,8 @@ func parseMask(s string) Name {
return r
}
func MustParseName(s, defaults string) Name {
r := ParseName(s, "")
func MustParseName(s, fill string) Name {
r := ParseName(s, fill)
if !r.IsValid() {
panic("invalid Name: " + s)
}
@@ -521,6 +521,8 @@ func parts(s string) iter_Seq2[PartKind, string] {
return
}
state, j, partLen = PartModel, i, 0
case PartHost:
// noop: support for host:port
default:
yield(PartExtraneous, s[i+1:j])
return
@@ -606,7 +608,7 @@ func ParseNameFromFilepath(s, fill string) Name {
var r Name
for i := range PartBuild + 1 {
part, rest, _ := strings.Cut(s, string(filepath.Separator))
if !isValidPart(i, part) {
if !IsValidNamePart(i, part) {
return Name{}
}
r.parts[i] = part
@@ -643,9 +645,21 @@ func (r Name) Filepath() string {
return filepath.Join(r.parts[:]...)
}
// isValidPart reports if s contains all valid characters for the given
// part kind.
func isValidPart(kind PartKind, s string) bool {
// FilepathNoBuild returns a complete, canonicalized, relative file path using
// the parts of a complete Name, but without the build part.
func (r Name) FilepathNoBuild() string {
for i := range PartBuild {
r.parts[i] = strings.ToLower(r.parts[i])
}
return filepath.Join(r.parts[:PartBuild]...)
}
// IsValidNamePart reports if s contains all valid characters for the given
// part kind and is under MaxNamePartLen bytes.
func IsValidNamePart(kind PartKind, s string) bool {
if len(s) > MaxNamePartLen {
return false
}
if s == "" {
return false
}
@@ -669,6 +683,9 @@ func isValidByteFor(kind PartKind, c byte) bool {
if kind == PartNamespace && c == '.' {
return false
}
if kind == PartHost && c == ':' {
return true
}
if c == '.' || c == '-' {
return true
}
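
Together, the exported `IsValidNamePart` and the `PartHost` special case above allow a model reference to carry a registry port, e.g. `localhost:5000/ns/mistral` in the test table below. A small sketch of calling the exported validator (assuming the `types/model` package at this revision; the import path follows the repository's module path):

```go
package main

import (
	"fmt"

	"github.com/ollama/ollama/types/model"
)

func main() {
	// ':' is only treated as a valid byte inside the host part, so
	// "localhost:5000" should pass as a host but not as a namespace (assumed).
	fmt.Println(model.IsValidNamePart(model.PartHost, "localhost:5000"))      // true
	fmt.Println(model.IsValidNamePart(model.PartNamespace, "localhost:5000")) // false
}
```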

View File

@@ -40,6 +40,7 @@ var testNames = map[string]fields{
"user/model": {namespace: "user", model: "model"},
"example.com/ns/mistral:7b+Q4_0": {host: "example.com", namespace: "ns", model: "mistral", tag: "7b", build: "Q4_0"},
"example.com/ns/mistral:7b+X": {host: "example.com", namespace: "ns", model: "mistral", tag: "7b", build: "X"},
"localhost:5000/ns/mistral": {host: "localhost:5000", namespace: "ns", model: "mistral"},
// invalid digest
"mistral:latest@invalid256-": {},
@@ -104,6 +105,12 @@ var testNames = map[string]fields{
strings.Repeat("a", MaxNamePartLen+1): {},
}
func TestIsValidNameLen(t *testing.T) {
if IsValidNamePart(PartNamespace, strings.Repeat("a", MaxNamePartLen+1)) {
t.Errorf("unexpectedly valid long name")
}
}
// TestConsecutiveDots tests that consecutive dots are not allowed in any
// part, to avoid path traversal. There also are some tests in testNames, but
// this test is more exhaustive and exists to emphasize the importance of
@@ -318,6 +325,13 @@ func TestNameGoString(t *testing.T) {
}
}
func TestDisplayLongest(t *testing.T) {
g := ParseName("example.com/library/mistral:latest+Q4_0", FillNothing).DisplayLongest()
if g != "example.com/library/mistral:latest" {
t.Errorf("got = %q; want %q", g, "example.com/library/mistral:latest")
}
}
func TestDisplayShortest(t *testing.T) {
cases := []struct {
in string
@@ -478,30 +492,36 @@ func TestNamePath(t *testing.T) {
}
}
func TestNameFromFilepath(t *testing.T) {
func TestNameFilepath(t *testing.T) {
cases := []struct {
in string
want string
in string
want string
wantNoBuild string
}{
{
in: "example.com/library/mistral:latest+Q4_0",
want: "example.com/library/mistral/latest/Q4_0",
in: "example.com/library/mistral:latest+Q4_0",
want: "example.com/library/mistral/latest/Q4_0",
wantNoBuild: "example.com/library/mistral/latest",
},
{
in: "Example.Com/Library/Mistral:Latest+Q4_0",
want: "example.com/library/mistral/latest/Q4_0",
in: "Example.Com/Library/Mistral:Latest+Q4_0",
want: "example.com/library/mistral/latest/Q4_0",
wantNoBuild: "example.com/library/mistral/latest",
},
{
in: "Example.Com/Library/Mistral:Latest+Q4_0",
want: "example.com/library/mistral/latest/Q4_0",
in: "Example.Com/Library/Mistral:Latest+Q4_0",
want: "example.com/library/mistral/latest/Q4_0",
wantNoBuild: "example.com/library/mistral/latest",
},
{
in: "example.com/library/mistral:latest",
want: "example.com/library/mistral/latest",
in: "example.com/library/mistral:latest",
want: "example.com/library/mistral/latest",
wantNoBuild: "example.com/library/mistral/latest",
},
{
in: "",
want: "",
in: "",
want: "",
wantNoBuild: "",
},
}
for _, tt := range cases {
@@ -513,6 +533,11 @@ func TestNameFromFilepath(t *testing.T) {
if g != tt.want {
t.Errorf("got = %q; want %q", g, tt.want)
}
g = p.FilepathNoBuild()
g = filepath.ToSlash(g)
if g != tt.wantNoBuild {
t.Errorf("got = %q; want %q", g, tt.wantNoBuild)
}
})
}
}