Mirror of https://github.com/ollama/ollama.git (synced 2026-01-10 00:18:05 -05:00)

Compare commits: jmorganca/... bmizerany/...

53 Commits
| Author | SHA1 | Date |
|---|---|---|
|  | d9ea2e5c7a |  |
|  | 5ce997a7b9 |  |
|  | 672ffe9b7d |  |
|  | 47cfe58af5 |  |
|  | e72c567cfd |  |
|  | 3e22611200 |  |
|  | a54d4a28dc |  |
|  | 82b0c7c27e |  |
|  | ba7cf7fb66 |  |
|  | 2f804068bd |  |
|  | 34d00f90b1 |  |
|  | b53229a2ed |  |
|  | 53c107e20e |  |
|  | 51578d8573 |  |
|  | b5fcd9d3aa |  |
|  | b80661e8c7 |  |
|  | 6d3adfbea2 |  |
|  | 369eda65f5 |  |
|  | f878e91070 |  |
|  | 0d651478e4 |  |
|  | 9ea492f1ce |  |
|  | bc13da2bfe |  |
|  | 41b00b9856 |  |
|  | c2a8ed48e7 |  |
|  | 3dc1bb6a35 |  |
|  | 7865a6996a |  |
|  | 00ec269321 |  |
|  | 908005d90b |  |
|  | cdf65e793f |  |
|  | 82ca694d68 |  |
|  | 5017a15bcb |  |
|  | e11668aa07 |  |
|  | 0bd0f4a29c |  |
|  | 1ffb1e2874 |  |
|  | 0a7844413c |  |
|  | f9cd55c70b |  |
|  | 0fdebb34a9 |  |
|  | ac64cd4ef9 |  |
|  | 4a5c9b8035 |  |
|  | efe5617b64 |  |
|  | 5b3fad9636 |  |
|  | bfec2c6e10 |  |
|  | 5c143af726 |  |
|  | 6c0af2599e |  |
|  | fc8c044584 |  |
|  | ecc133d843 |  |
|  | 76bdebbadf |  |
|  | 18979ad4a1 |  |
|  | 8e0ef931d8 |  |
|  | 280da44522 |  |
|  | 0cebc79cba |  |
|  | b1e74d4fda |  |
|  | 69f0227813 |  |
@@ -1,8 +1,9 @@
 .vscode
 ollama
 app
 macapp
 dist
 llm/llama.cpp
 .env
 .cache
+test_data
@@ -46,7 +46,7 @@ RUN mkdir /tmp/scratch && \
     done && \
     (cd /opt/rocm/lib && tar cf - rocblas/library) | (cd /tmp/scratch/ && tar xf - ) && \
     mkdir -p /go/src/github.com/jmorganca/ollama/dist/deps/ && \
-    (cd /tmp/scratch/ && tar czvf /go/src/github.com/jmorganca/ollama/dist/deps/rocm-amd64-deps.tgz . )
+    (cd /tmp/scratch/ && tar czvf /go/src/github.com/jmorganca/ollama/dist/deps/ollama-linux-amd64-rocm.tgz . )

 FROM --platform=linux/amd64 centos:7 AS cpu-builder-amd64

@@ -92,7 +92,7 @@ COPY --from=rocm-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/b
 COPY --from=rocm-build-amd64 /go/src/github.com/jmorganca/ollama/dist/deps/ ./dist/deps/
 ARG GOFLAGS
 ARG CGO_CFLAGS
-RUN go build .
+RUN go build -trimpath .

 # Intermediate stage used for ./scripts/build_linux.sh
 FROM --platform=linux/arm64 cpu-build-arm64 AS build-arm64

@@ -103,7 +103,7 @@ COPY . .
 COPY --from=cuda-build-arm64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
 ARG GOFLAGS
 ARG CGO_CFLAGS
-RUN go build .
+RUN go build -trimpath .

 # Runtime stages
 FROM --platform=linux/amd64 ubuntu:22.04 as runtime-amd64
50 api/types_test.go (new file)

@@ -0,0 +1,50 @@
package api

import (
	"encoding/json"
	"math"
	"testing"
	"time"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

func TestKeepAliveParsingFromJSON(t *testing.T) {
	tests := []struct {
		name string
		req  string
		exp  *Duration
	}{
		{
			name: "Positive Integer",
			req:  `{ "keep_alive": 42 }`,
			exp:  &Duration{42 * time.Second},
		},
		{
			name: "Positive Integer String",
			req:  `{ "keep_alive": "42m" }`,
			exp:  &Duration{42 * time.Minute},
		},
		{
			name: "Negative Integer",
			req:  `{ "keep_alive": -1 }`,
			exp:  &Duration{math.MaxInt64},
		},
		{
			name: "Negative Integer String",
			req:  `{ "keep_alive": "-1m" }`,
			exp:  &Duration{math.MaxInt64},
		},
	}

	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			var dec ChatRequest
			err := json.Unmarshal([]byte(test.req), &dec)
			require.NoError(t, err)

			assert.Equal(t, test.exp, dec.KeepAlive)
		})
	}
}
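For reference, the behavior these tests pin down: a bare JSON number is treated as seconds, a string is parsed with Go duration syntax, and any negative value means "keep the model loaded indefinitely". A minimal sketch of an `UnmarshalJSON` that would satisfy them (the actual `api.Duration` implementation may differ):

```go
// Sketch only: assumes Duration wraps a single time.Duration field,
// as the test literals (&Duration{42 * time.Second}) suggest.
func (d *Duration) UnmarshalJSON(b []byte) error {
	var v any
	if err := json.Unmarshal(b, &v); err != nil {
		return err
	}
	switch t := v.(type) {
	case float64: // bare JSON numbers are seconds
		d.Duration = time.Duration(t) * time.Second
	case string: // strings use Go duration syntax, e.g. "42m"
		dur, err := time.ParseDuration(t)
		if err != nil {
			return err
		}
		d.Duration = dur
	}
	if d.Duration < 0 {
		d.Duration = time.Duration(math.MaxInt64) // negative => never unload
	}
	return nil
}
```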
@@ -970,9 +970,10 @@ func NewCLI() *cobra.Command {
 	serveCmd.SetUsageTemplate(serveCmd.UsageTemplate() + `
 Environment Variables:

 	OLLAMA_HOST         The host:port to bind to (default "127.0.0.1:11434")
 	OLLAMA_ORIGINS      A comma separated list of allowed origins.
 	OLLAMA_MODELS       The path to the models directory (default is "~/.ollama/models")
+	OLLAMA_KEEP_ALIVE   The duration that models stay loaded in memory (default is "5m")
 `)

 	pullCmd := &cobra.Command{
@@ -103,16 +103,16 @@ func ReadSafeTensors(fn string, offset uint64) ([]llm.Tensor, uint64, error) {
 		return []llm.Tensor{}, 0, err
 	}

-	shape := [4]uint64{0, 0, 0, 0}
-	for cnt, s := range data.Shape {
-		shape[cnt] = uint64(s)
+	shape := []uint64{0, 0, 0, 0}
+	for i := range data.Shape {
+		shape[i] = uint64(data.Shape[i])
 	}

 	t := llm.Tensor{
 		Name:   ggufName,
 		Kind:   kind,
 		Offset: offset,
-		Shape:  shape,
+		Shape:  shape[:],
 		FileName:      fn,
 		OffsetPadding: 8 + jsonSize,
 		FileOffsets:   []uint64{uint64(data.Offsets[0]), uint64(data.Offsets[1])},
@@ -1,25 +1,21 @@
 # Documentation

-To get started, see the project's **[quickstart](../README.md#quickstart)**.
+### Getting Started
+
+* [Quickstart](../README.md#quickstart)
+* [Examples](../examples)
+* [Importing models](./import.md)
+* [Linux Documentation](./linux.md)
+* [Windows Documentation](./windows.md)
+* [Docker Documentation](https://hub.docker.com/r/ollama/ollama)

-Ollama is a tool for running AI models on your hardware. Many users will choose to use the Command Line Interface (CLI) to work with Ollama. Learn more about all the commands in the CLI in the **[Main Readme](../README.md)**.
+### Reference
+
+* [API Reference](./api.md)
+* [Modelfile Reference](./modelfile.md)
+* [OpenAI Compatibility](./openai.md)

-Use the RESTful API using any language, including Python, JavaScript, Typescript, Go, Rust, and many more. Learn more about using the API in the **[API Documentation](./api.md)**.
-
-Create new models or modify models already in the library using the Modelfile. Learn more about the Modelfile syntax in the **[Modelfile Documentation](./modelfile.md)**.
-
-Import models using source model weights found on Hugging Face and similar sites by referring to the **[Import Documentation](./import.md)**.
-
-Installing on Linux in most cases is easy using the script on [ollama.com/download](ollama.com/download). To get more detail about the install, including CUDA drivers, see the **[Linux Documentation](./linux.md)**.
-
-Many of our users like the flexibility of using our official Docker Image. Learn more about using Docker with Ollama using the **[Docker Documentation](https://hub.docker.com/r/ollama/ollama)**.
-
-It is easy to install on Linux and Mac, but many users will choose to build Ollama on their own. To do this, refer to the **[Development Documentation](./development.md)**.
-
-If encountering a problem with Ollama, the best place to start is the logs. Find more information about them here in the **[Troubleshooting Guide](./troubleshooting.md)**.
-
-Finally for all the questions that don't fit anywhere else, there is the **[FAQ](./faq.md)**
-
-[Tutorials](./tutorials.md) apply the documentation to tasks.
-
-For working code examples of using Ollama, see [Examples](../examples).
+### Resources
+
+* [Troubleshooting Guide](./troubleshooting.md)
+* [FAQ](./faq.md)
+* [Development guide](./development.md)
@@ -135,3 +135,10 @@ go build .
 In addition to the common Windows development tools described above, install CUDA **AFTER** you install MSVC.

 - [NVIDIA CUDA](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html)
+
+#### Windows ROCm (AMD Radeon)
+
+In addition to the common Windows development tools described above, install AMD's HIP package **AFTER** you install MSVC.
+
+- [AMD HIP](https://www.amd.com/en/developer/resources/rocm-hub/hip-sdk.html)
10 docs/faq.md

@@ -193,3 +193,13 @@ To unload the model and free up memory use:
 ```shell
 curl http://localhost:11434/api/generate -d '{"model": "llama2", "keep_alive": 0}'
 ```
+
+## Controlling which GPUs to use
+
+By default, on Linux and Windows, Ollama will attempt to use Nvidia or Radeon
+GPUs, and will use all the GPUs it can find. You can limit which GPUs will be
+used by setting the environment variable `CUDA_VISIBLE_DEVICES` for NVIDIA
+cards, or `HIP_VISIBLE_DEVICES` for Radeon GPUs, to a comma-delimited list of
+GPU IDs. You can see the list of devices with GPU tools such as `nvidia-smi` or
+`rocminfo`. You can set an invalid GPU ID (e.g., "-1") to bypass the GPUs and
+fall back to the CPU.
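A minimal Go equivalent of the `curl` unload request above, using only the standard library (the endpoint and payload are exactly those shown):

```go
package main

import (
	"fmt"
	"io"
	"net/http"
	"strings"
)

func main() {
	// keep_alive: 0 asks the server to unload the model immediately.
	body := strings.NewReader(`{"model": "llama2", "keep_alive": 0}`)
	resp, err := http.Post("http://localhost:11434/api/generate", "application/json", body)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	out, _ := io.ReadAll(resp.Body)
	fmt.Println(string(out))
}
```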
@@ -72,6 +72,11 @@ Verify that the drivers are installed by running the following command, which sh
 nvidia-smi
 ```

+### Install ROCm (optional - for Radeon GPUs)
+
+[Download and Install](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html)
+
+Make sure to install ROCm v6.
+
 ### Start Ollama

 Start Ollama using `systemd`:
@@ -131,7 +131,7 @@ The `PARAMETER` instruction defines a parameter that can be set when the model i
 PARAMETER <parameter> <parametervalue>
 ```

-### Valid Parameters and Values
+#### Valid Parameters and Values

 | Parameter | Description | Value Type | Example Usage |
 | --------- | ----------- | ---------- | ------------- |

@@ -201,7 +201,22 @@ LICENSE """

 ### MESSAGE

-The `MESSAGE` instruction allows you to specify a message history for the model to use when responding:
+The `MESSAGE` instruction allows you to specify a message history for the model to use when responding. Use multiple iterations of the MESSAGE command to build up a conversation, which will guide the model to answer in a similar way.

 ```modelfile
 MESSAGE <role> <message>
 ```

+#### Valid roles
+
+| Role      | Description                                                   |
+| --------- | ------------------------------------------------------------- |
+| system    | Alternate way of providing the SYSTEM message for the model.  |
+| user      | An example message of what the user could have asked.         |
+| assistant | An example message of how the model should respond.           |
+
+#### Example conversation
+
+```modelfile
 MESSAGE user Is Toronto in Canada?

@@ -212,6 +227,7 @@ MESSAGE user Is Ontario in Canada?
 MESSAGE assistant yes
 ```

 ## Notes

 - the **`Modelfile` is not case sensitive**. In the examples, uppercase instructions are used to make it easier to distinguish them from arguments.
@@ -70,30 +70,36 @@ cat /proc/cpuinfo| grep flags | head -1
 ## AMD Radeon GPU Support

 Ollama leverages the AMD ROCm library, which does not support all AMD GPUs. In
-some cases you can force the system to try to use a close GPU type. For example
-The Radeon RX 5400 is `gfx1034` (also known as 10.3.4) however, ROCm does not
-support this patch-level, the closest support is `gfx1030`. You can use the
-environment variable `HSA_OVERRIDE_GFX_VERSION` with `x.y.z` syntax. So for
-example, to force the system to run on the RX 5400, you would set
-`HSA_OVERRIDE_GFX_VERSION="10.3.0"` as an environment variable for the server.
+some cases you can force the system to try to use a similar LLVM target that is
+close. For example, the Radeon RX 5400 is `gfx1034` (also known as 10.3.4);
+however, ROCm does not currently support this target. The closest support is
+`gfx1030`. You can use the environment variable `HSA_OVERRIDE_GFX_VERSION` with
+`x.y.z` syntax. So, for example, to force the system to run on the RX 5400, you
+would set `HSA_OVERRIDE_GFX_VERSION="10.3.0"` as an environment variable for the
+server. If you have an unsupported AMD GPU, you can experiment using the list of
+supported types below.

-At this time, the known supported GPU types are the following: (This may change from
-release to release)
-- gfx900
-- gfx906
-- gfx908
-- gfx90a
-- gfx940
-- gfx941
-- gfx942
-- gfx1030
-- gfx1100
-- gfx1101
-- gfx1102
+At this time, the known supported GPU types are the following LLVM targets.
+This table shows some example GPUs that map to these LLVM targets:
+
+| **LLVM Target** | **An Example GPU**    |
+|-----------------|-----------------------|
+| gfx900          | Radeon RX Vega 56     |
+| gfx906          | Radeon Instinct MI50  |
+| gfx908          | Radeon Instinct MI100 |
+| gfx90a          | Radeon Instinct MI210 |
+| gfx940          | Radeon Instinct MI300 |
+| gfx941          |                       |
+| gfx942          |                       |
+| gfx1030         | Radeon PRO V620       |
+| gfx1100         | Radeon PRO W7900      |
+| gfx1101         | Radeon PRO W7700      |
+| gfx1102         | Radeon RX 7600        |

-This will not work for all unsupported GPUs. Reach out on [Discord](https://discord.gg/ollama)
-or file an [issue](https://github.com/ollama/ollama/issues) for additional help.
+AMD is working on enhancing ROCm v6 to broaden support for families of GPUs in a
+future release, which should increase support for more GPUs.
+
+Reach out on [Discord](https://discord.gg/ollama) or file an
+[issue](https://github.com/ollama/ollama/issues) for additional help.

 ## Installing older versions on Linux

@@ -103,7 +109,3 @@ which version to install.
 ```sh
 curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION="0.1.27" sh
 ```
-
-## Known issues
-
-* N/A
@@ -40,19 +40,17 @@ func amdSetVisibleDevices(ids []int, skip map[int]interface{}) {
 	// TODO - does sort order matter?
 	devices := []string{}
 	for i := range ids {
-		slog.Debug(fmt.Sprintf("i=%d", i))
 		if _, skipped := skip[i]; skipped {
-			slog.Debug("skipped")
 			continue
 		}
 		devices = append(devices, strconv.Itoa(i))
 	}
-	slog.Debug(fmt.Sprintf("devices=%v", devices))

 	val := strings.Join(devices, ",")
 	err := os.Setenv("HIP_VISIBLE_DEVICES", val)
 	if err != nil {
 		slog.Warn(fmt.Sprintf("failed to set env: %s", err))
-	} else {
-		slog.Info("Setting HIP_VISIBLE_DEVICES=" + val)
 	}
+	slog.Debug("HIP_VISIBLE_DEVICES=" + val)
 }
@@ -11,14 +11,11 @@ import (
 	"slices"
 	"strconv"
 	"strings"
-
-	"github.com/jmorganca/ollama/version"
 )

 // Discovery logic for AMD/ROCm GPUs

 const (
-	curlMsg = "curl -fsSL https://github.com/ollama/ollama/releases/download/v%s/rocm-amd64-deps.tgz | tar -zxf - -C %s"
 	DriverVersionFile     = "/sys/module/amdgpu/version"
 	AMDNodesSysfsDir      = "/sys/class/kfd/kfd/topology/nodes/"
 	GPUPropertiesFileGlob = AMDNodesSysfsDir + "*/properties"

@@ -27,6 +24,9 @@ const (
 	GPUTotalMemoryFileGlob = "mem_banks/*/properties" // size_in_bytes line
 	GPUUsedMemoryFileGlob  = "mem_banks/*/used_memory"
 	RocmStandardLocation   = "/opt/rocm/lib"
+
+	// TODO find a better way to detect iGPU instead of minimum memory
+	IGPUMemLimit = 1024 * 1024 * 1024 // 512G is what they typically report, so anything less than 1G must be iGPU
 )

 var (
@@ -149,8 +149,8 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
 	resp.memInfo.DeviceCount = 0
 	resp.memInfo.TotalMemory = 0
 	resp.memInfo.FreeMemory = 0
+	slog.Debug("discovering VRAM for amdgpu devices")
 	if len(ids) == 0 {
-		slog.Debug("discovering all amdgpu devices")
 		entries, err := os.ReadDir(AMDNodesSysfsDir)
 		if err != nil {
 			slog.Warn(fmt.Sprintf("failed to read amdgpu sysfs %s - %s", AMDNodesSysfsDir, err))

@@ -168,7 +168,7 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
 			ids = append(ids, id)
 		}
 	}
-	slog.Debug(fmt.Sprintf("discovering amdgpu devices %v", ids))
+	slog.Debug(fmt.Sprintf("amdgpu devices %v", ids))

 	for _, id := range ids {
 		if _, skipped := skip[id]; skipped {

@@ -176,7 +176,8 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
 		}
 		totalMemory := uint64(0)
 		usedMemory := uint64(0)
-		propGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(id), GPUTotalMemoryFileGlob)
+		// Adjust for sysfs vs HIP ids
+		propGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(id+1), GPUTotalMemoryFileGlob)
 		propFiles, err := filepath.Glob(propGlob)
 		if err != nil {
 			slog.Warn(fmt.Sprintf("error looking up total GPU memory: %s %s", propGlob, err))

@@ -208,6 +209,13 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
 			}
 		}
 		if totalMemory == 0 {
 			slog.Warn(fmt.Sprintf("amdgpu [%d] reports zero total memory, skipping", id))
 			skip[id] = struct{}{}
 			continue
 		}
+		if totalMemory < IGPUMemLimit {
+			slog.Info(fmt.Sprintf("amdgpu [%d] appears to be an iGPU with %dM reported total memory, skipping", id, totalMemory/1024/1024))
+			skip[id] = struct{}{}
+			continue
+		}
 		usedGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(id), GPUUsedMemoryFileGlob)

@@ -235,8 +243,8 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
 		}
 		usedMemory += used
 	}
-	slog.Info(fmt.Sprintf("[%d] amdgpu totalMemory %d", id, totalMemory))
-	slog.Info(fmt.Sprintf("[%d] amdgpu freeMemory %d", id, (totalMemory - usedMemory)))
+	slog.Info(fmt.Sprintf("[%d] amdgpu totalMemory %dM", id, totalMemory/1024/1024))
+	slog.Info(fmt.Sprintf("[%d] amdgpu freeMemory %dM", id, (totalMemory-usedMemory)/1024/1024))
 	resp.memInfo.DeviceCount++
 	resp.memInfo.TotalMemory += totalMemory
 	resp.memInfo.FreeMemory += (totalMemory - usedMemory)
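The iGPU filter added above is self-contained enough to show on its own; a hedged sketch (helper name hypothetical, threshold copied from the `IGPUMemLimit` constant):

```go
// iGPUMemLimit mirrors the constant introduced in this change: discrete
// cards report their real VRAM, so a node showing less than 1 GiB of
// total memory is assumed to be an integrated GPU and skipped.
const iGPUMemLimit = 1024 * 1024 * 1024 // 1 GiB

func shouldSkipDevice(totalMemory uint64) (skip bool, reason string) {
	switch {
	case totalMemory == 0:
		return true, "reports zero total memory"
	case totalMemory < iGPUMemLimit:
		return true, "appears to be an iGPU"
	default:
		return false, ""
	}
}
```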
@@ -278,22 +286,37 @@ func setupLink(source, target string) error {
 func AMDValidateLibDir() (string, error) {
 	// We rely on the rpath compiled into our library to find rocm
 	// so we establish a symlink to wherever we find it on the system
-	// to $AssetsDir/rocm
-	// If we already have a rocm dependency wired, nothing more to do
-	assetsDir, err := AssetsDir()
-	if err != nil {
-		return "", fmt.Errorf("unable to lookup lib dir: %w", err)
-	}
-	// Versioned directory
-	rocmTargetDir := filepath.Join(assetsDir, "rocm")
+	// to <payloads>/rocm
+	payloadsDir, err := PayloadsDir()
+	if err != nil {
+		return "", err
+	}
+	rocmTargetDir := filepath.Clean(filepath.Join(payloadsDir, "..", "rocm"))
 	if rocmLibUsable(rocmTargetDir) {
 		return rocmTargetDir, nil
 	}
-	// Parent dir (unversioned)
-	commonRocmDir := filepath.Join(filepath.Dir(assetsDir), "rocm")
-	if rocmLibUsable(commonRocmDir) {
-		return rocmTargetDir, setupLink(commonRocmDir, rocmTargetDir)
+
+	// next to the running binary
+	exe, err := os.Executable()
+	if err == nil {
+		peerDir := filepath.Dir(exe)
+		if rocmLibUsable(peerDir) {
+			slog.Debug("detected ROCM next to ollama executable " + peerDir)
+			return rocmTargetDir, setupLink(peerDir, rocmTargetDir)
+		}
+		peerDir = filepath.Join(filepath.Dir(exe), "rocm")
+		if rocmLibUsable(peerDir) {
+			slog.Debug("detected ROCM next to ollama executable " + peerDir)
+			return rocmTargetDir, setupLink(peerDir, rocmTargetDir)
+		}
+	}
+
+	// Well known ollama installer path
+	installedRocmDir := "/usr/share/ollama/lib/rocm"
+	if rocmLibUsable(installedRocmDir) {
+		return rocmTargetDir, setupLink(installedRocmDir, rocmTargetDir)
+	}

 	// Prefer explicit HIP env var

@@ -322,14 +345,9 @@ func AMDValidateLibDir() (string, error) {
 	if rocmLibUsable("/opt/rocm/lib") {
 		return rocmTargetDir, setupLink("/opt/rocm/lib", rocmTargetDir)
 	}
-	err = os.MkdirAll(rocmTargetDir, 0755)
-	if err != nil {
-		return "", fmt.Errorf("failed to create empty rocm dir %s %w", rocmTargetDir, err)
-	}

-	// If we still haven't found a usable rocm, the user will have to download it on their own
-	slog.Warn("amdgpu detected, but no compatible rocm library found. Either install rocm v6, or run the following")
-	slog.Warn(fmt.Sprintf(curlMsg, version.Version, rocmTargetDir))
+	// If we still haven't found a usable rocm, the user will have to install it on their own
+	slog.Warn("amdgpu detected, but no compatible rocm library found. Either install rocm v6, or follow manual install instructions at https://github.com/ollama/ollama/blob/main/docs/linux.md#manual-install")
 	return "", fmt.Errorf("no suitable rocm found, falling back to CPU")
 }
@@ -351,6 +369,8 @@ func AMDDriverVersion() (string, error) {
 }

 func AMDGFXVersions() map[int]Version {
+	// The amdgpu driver always exposes the host CPU as node 0, but we have to skip that and subtract one
+	// from the other IDs to get alignment with the HIP libraries expectations (zero is the first GPU, not the CPU)
 	res := map[int]Version{}
 	matches, _ := filepath.Glob(GPUPropertiesFileGlob)
 	for _, match := range matches {

@@ -366,17 +386,20 @@ func AMDGFXVersions() map[int]Version {
 			continue
 		}

+		if i == 0 {
+			// Skipping the CPU
+			continue
+		}
+		// Align with HIP IDs (zero is first GPU, not CPU)
+		i -= 1
+
 		scanner := bufio.NewScanner(fp)
 		for scanner.Scan() {
 			line := strings.TrimSpace(scanner.Text())
 			if strings.HasPrefix(line, "gfx_target_version") {
 				ver := strings.Fields(line)
 				if len(ver) != 2 || len(ver[1]) < 5 {
-					if ver[1] == "0" {
-						// Silently skip the CPU
-						continue
-					} else {
+					if ver[1] != "0" {
 						slog.Debug("malformed " + line)
 					}
 				res[i] = Version{
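The two hunks above implement the same convention; a standalone sketch of the mapping (helper name hypothetical):

```go
// sysfsNodeToHIPDevice captures the ID alignment described in the new
// comment: the amdgpu driver exposes the host CPU as node 0, so sysfs
// node N corresponds to HIP device N-1, and node 0 is skipped entirely.
func sysfsNodeToHIPDevice(node int) (device int, isGPU bool) {
	if node == 0 {
		return 0, false // node 0 is the CPU, not a GPU
	}
	return node - 1, true
}
```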
@@ -140,7 +140,7 @@ func AMDValidateLibDir() (string, error) {
 	// $LibDir/rocm, we instead rely on setting PATH to point
 	// to the location of the ROCm library

-	// Installer payload location
+	// Installer payload location if we're running the installed binary
 	exe, err := os.Executable()
 	if err == nil {
 		rocmTargetDir := filepath.Join(filepath.Dir(exe), "rocm")

@@ -150,13 +150,12 @@ func AMDValidateLibDir() (string, error) {
 		}
 	}

-	// If we already have a rocm dependency wired, nothing more to do
-	libDir, err := AssetsDir()
-	if err != nil {
-		return "", fmt.Errorf("unable to lookup lib dir: %w", err)
-	}
-	rocmTargetDir := filepath.Join(libDir, "rocm")
+	// Installer payload (if we're running from some other location)
+	localAppData := os.Getenv("LOCALAPPDATA")
+	appDir := filepath.Join(localAppData, "Programs", "Ollama")
+	rocmTargetDir := filepath.Join(appDir, "rocm")
 	if rocmLibUsable(rocmTargetDir) {
 		slog.Debug("detected ollama installed ROCm at " + rocmTargetDir)
 		return rocmTargetDir, nil
 	}

@@ -175,16 +174,7 @@ func AMDValidateLibDir() (string, error) {
 		return RocmStandardLocation, nil
 	}

-	// Installer payload (if we're running from some other location)
-	localAppData := os.Getenv("LOCALAPPDATA")
-	appDir := filepath.Join(localAppData, "Programs", "Ollama")
-	rocmTargetDir = filepath.Join(appDir, "rocm")
-	if rocmLibUsable(rocmTargetDir) {
-		slog.Debug("detected ollama installed ROCm at " + rocmTargetDir)
-		return rocmTargetDir, nil
-	}
-
 	// Should not happen on windows since we include it in the installer, but stand-alone binary might hit this
-	slog.Warn("amdgpu detected, but no compatible rocm library found. Please install ROCm v6")
+	slog.Warn("amdgpu detected, but no compatible rocm library found. Please install ROCm")
 	return "", fmt.Errorf("no suitable rocm found, falling back to CPU")
 }
@@ -7,15 +7,41 @@ import (
 	"path/filepath"
 	"runtime"
 	"strings"
+	"sync"
 )

-func AssetsDir() (string, error) {
-	home, err := os.UserHomeDir()
-	if err != nil {
-		return "", err
-	}
-
-	return filepath.Join(home, ".ollama", "assets"), nil
+var (
+	lock        sync.Mutex
+	payloadsDir = ""
+)
+
+func PayloadsDir() (string, error) {
+	lock.Lock()
+	defer lock.Unlock()
+	if payloadsDir == "" {
+		tmpDir, err := os.MkdirTemp("", "ollama")
+		if err != nil {
+			return "", fmt.Errorf("failed to generate tmp dir: %w", err)
+		}
+		// We create a distinct subdirectory for payloads within the tmpdir
+		// This will typically look like /tmp/ollama3208993108/runners on linux
+		payloadsDir = filepath.Join(tmpDir, "runners")
+	}
+	return payloadsDir, nil
+}
+
+func Cleanup() {
+	lock.Lock()
+	defer lock.Unlock()
+	if payloadsDir != "" {
+		// We want to fully clean up the tmpdir parent of the payloads dir
+		tmpDir := filepath.Clean(filepath.Join(payloadsDir, ".."))
+		slog.Debug("cleaning up", "dir", tmpDir)
+		err := os.RemoveAll(tmpDir)
+		if err != nil {
+			slog.Warn("failed to clean up", "dir", tmpDir, "err", err)
+		}
+	}
 }

 func UpdatePath(dir string) {
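A sketch of how the new helpers are presumably meant to be used together (caller code assumed, not part of the diff):

```go
package main

import (
	"log/slog"

	"github.com/jmorganca/ollama/gpu"
)

func main() {
	// Resolve the per-process staging dir, e.g. /tmp/ollama3208993108/runners.
	dir, err := gpu.PayloadsDir()
	if err != nil {
		panic(err)
	}
	// Cleanup removes the parent tmp dir, not just the "runners" subdir.
	defer gpu.Cleanup()

	slog.Info("staging dynamic libraries", "dir", dir)
	// ... extract payloads into dir, then run the server ...
}
```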
@@ -155,8 +155,8 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
     }
   }

-  LOG(h.verbose, "[%d] CUDA totalMem %ld\n", i, memInfo.total);
-  LOG(h.verbose, "[%d] CUDA usedMem %ld\n", i, memInfo.used);
+  LOG(h.verbose, "[%d] CUDA totalMem %llu\n", i, memInfo.total);
+  LOG(h.verbose, "[%d] CUDA usedMem %llu\n", i, memInfo.used);

   resp->total += memInfo.total;
   resp->free += memInfo.free;
@@ -149,7 +149,7 @@ func newDynExtServer(library, model string, adapters, projectors []string, opts

 	slog.Info("Initializing llama server")
 	slog.Debug(fmt.Sprintf("server params: %+v", sparams))
-	initResp := newExtServerResp(128)
+	initResp := newExtServerResp(512)
 	defer freeExtServerResp(initResp)
 	C.dyn_llama_server_init(llm.s, &sparams, &initResp)
 	if initResp.id < 0 {

@@ -198,6 +198,9 @@ func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn fu

 	if predict.Format == "json" {
 		request["grammar"] = jsonGrammar
+		if !strings.Contains(strings.ToLower(predict.Prompt), "json") {
+			slog.Warn("Prompt does not specify that the LLM should response in JSON, but JSON format is expected. For best results specify that JSON is expected in the system prompt.")
+		}
 	}

 	retryDelay := 100 * time.Microsecond

@@ -225,17 +228,14 @@ func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn fu
 	}

 	retryNeeded := false
+	// keep track of the last token generated, this is used to abort if the model starts looping
+	var lastToken string
+	var tokenRepeat int
 out:
 	for {
 		select {
 		case <-ctx.Done():
-			// This handles the request cancellation
-			C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp)
-			if resp.id < 0 {
-				return extServerResponseToErr(resp)
-			} else {
-				return nil
-			}
+			return cancelCompletion(llm, resp)
 		default:
 			var result C.ext_server_task_result_t
 			C.dyn_llama_server_completion_next_result(llm.s, resp.id, &result)

@@ -258,6 +258,20 @@ func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn fu
 				break out
 			}

+			switch {
+			case strings.TrimSpace(p.Content) == lastToken:
+				tokenRepeat++
+			default:
+				lastToken = strings.TrimSpace(p.Content)
+				tokenRepeat = 0
+			}
+
+			// 30 picked as an arbitrary max token repeat limit, modify as needed
+			if tokenRepeat > 30 {
+				slog.Debug("prediction aborted, token repeat limit reached")
+				return cancelCompletion(llm, resp)
+			}
+
 			if p.Content != "" {
 				fn(PredictResult{
 					Content: p.Content,

@@ -285,6 +299,15 @@ func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn fu
 	return fmt.Errorf("max retries exceeded")
 }

+func cancelCompletion(llm *dynExtServer, resp C.ext_server_resp_t) error {
+	C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp)
+	if resp.id < 0 {
+		return extServerResponseToErr(resp)
+	} else {
+		return nil
+	}
+}
+
 func (llm *dynExtServer) Encode(ctx context.Context, prompt string) ([]int, error) {
 	data, err := json.Marshal(TokenizeRequest{Content: prompt})
 	if err != nil {
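The loop-abort heuristic added to `Predict` is easy to exercise in isolation; a standalone sketch (type and names hypothetical):

```go
package main

import (
	"fmt"
	"strings"
)

// repeatDetector mirrors the added logic: count consecutive identical
// (whitespace-trimmed) tokens and abort once the count exceeds a limit.
type repeatDetector struct {
	lastToken   string
	tokenRepeat int
}

func (r *repeatDetector) shouldAbort(content string, limit int) bool {
	switch tok := strings.TrimSpace(content); {
	case tok == r.lastToken:
		r.tokenRepeat++
	default:
		r.lastToken = tok
		r.tokenRepeat = 0
	}
	return r.tokenRepeat > limit
}

func main() {
	var d repeatDetector
	for i := 0; ; i++ {
		if d.shouldAbort("the", 30) {
			fmt.Println("aborted after", i+1, "identical tokens")
			break
		}
	}
}
```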
@@ -26,7 +26,7 @@
 #endif // GGML_USE_CUBLAS

 // Expose the llama server as a callable extern "C" API
-server_context *llama = NULL;
+llama_server_context *llama = NULL;
 std::thread ext_server_thread;
 bool shutting_down = false;
 std::atomic_int recv_counter;

@@ -57,7 +57,7 @@ void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
   err->id = 0;
   err->msg[0] = '\0';
   try {
-    llama = new server_context;
+    llama = new llama_server_context;
     gpt_params params;
     params.n_ctx = sparams->n_ctx;
     params.n_batch = sparams->n_batch;

@@ -114,16 +114,12 @@ void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
     llama_backend_init();
     llama_numa_init(params.numa);

-    // load the model
-    if (!llama->load_model(params)) {
-      // TODO - consider modifying the logging logic or patching load_model so
-      // we can capture more detailed error messages and pass them back to the
-      // caller for better UX
-      err->id = -1;
-      snprintf(err->msg, err->msg_len, "error loading model %s",
-               params.model.c_str());
-      return;
-    }
+    if (!llama->load_model(params)) {
+      // an error occurred that was not thrown
+      err->id = -1;
+      snprintf(err->msg, err->msg_len, "error loading model %s", params.model.c_str());
+      return;
+    }

     llama->initialize();
   } catch (std::exception &e) {

@@ -144,13 +140,13 @@ void llama_server_start() {
   LOG_TEE("llama server main loop starting\n");
   ggml_time_init();
   llama->queue_tasks.on_new_task(std::bind(
-      &server_context::process_single_task, llama, std::placeholders::_1));
+      &llama_server_context::process_single_task, llama, std::placeholders::_1));
   llama->queue_tasks.on_finish_multitask(std::bind(
-      &server_context::on_finish_multitask, llama, std::placeholders::_1));
+      &llama_server_context::on_finish_multitask, llama, std::placeholders::_1));
   llama->queue_tasks.on_run_slots(std::bind(
-      &server_context::update_slots, llama));
+      &llama_server_context::update_slots, llama));
   llama->queue_results.on_multitask_update(std::bind(
-      &server_queue::update_multitask,
+      &llama_server_queue::update_multitask,
       &llama->queue_tasks,
       std::placeholders::_1,
       std::placeholders::_2,

@@ -198,7 +194,7 @@ void llama_server_completion(const char *json_req, ext_server_resp_t *resp) {
     json data = json::parse(json_req);
     resp->id = llama->queue_tasks.get_new_id();
     llama->queue_results.add_waiting_task_id(resp->id);
-    llama->request_completion(resp->id, -1, data, false, false);
+    llama->request_completion(resp->id, data, false, false, -1);
   } catch (std::exception &e) {
     snprintf(resp->msg, resp->msg_len, "exception %s", e.what());
   } catch (...) {

@@ -216,9 +212,9 @@ void llama_server_completion_next_result(const int task_id,
   std::string result_json;
   try {
     atomicRecv ar(recv_counter);
-    server_task_result result = llama->queue_results.recv(task_id);
+    task_result result = llama->queue_results.recv(task_id);
     result_json =
-        result.data.dump(-1, ' ', false, json::error_handler_t::replace);
+        result.result_json.dump(-1, ' ', false, json::error_handler_t::replace);
     resp->id = result.id;
     resp->stop = result.stop;
     resp->error = result.error;

@@ -363,10 +359,10 @@ void llama_server_embedding(const char *json_req, char **json_resp,
   }
   const int task_id = llama->queue_tasks.get_new_id();
   llama->queue_results.add_waiting_task_id(task_id);
-  llama->request_completion(task_id, -1, {{"prompt", prompt}, {"n_predict", 0}}, false, true);
+  llama->request_completion(task_id, {{"prompt", prompt}, {"n_predict", 0}}, false, true, -1);
   atomicRecv ar(recv_counter);
-  server_task_result result = llama->queue_results.recv(task_id);
-  std::string result_json = result.data.dump();
+  task_result result = llama->queue_results.recv(task_id);
+  std::string result_json = result.result_json.dump();
   const std::string::size_type size = result_json.size() + 1;
   *json_resp = new char[size];
   snprintf(*json_resp, size, "%s", result_json.c_str());
@@ -60,7 +60,7 @@ case "${GOARCH}" in
         compress_libs
         ;;
     "arm64")
-        CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DLLAMA_ACCELERATE=on -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=on ${CMAKE_DEFS}"
+        CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DLLAMA_METAL_EMBED_LIBRARY=on -DLLAMA_ACCELERATE=on -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=on ${CMAKE_DEFS}"
         BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/metal"
        EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders"
         build

@@ -185,7 +185,7 @@ if [ -d "${ROCM_PATH}" ]; then
     init_vars
     CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
     BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/rocm${ROCM_VARIANT}"
-    EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,\$ORIGIN/../rocm/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
+    EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,\$ORIGIN/../../rocm/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
     build

     # Record the ROCM dependencies

@@ -194,6 +194,12 @@ if [ -d "${ROCM_PATH}" ]; then
     for dep in $(ldd "${BUILD_DIR}/lib/libext_server.so" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e rocm -e amdgpu -e libtinfo ); do
         echo "${dep}" >> "${BUILD_DIR}/lib/deps.txt"
     done
+    # bomb out if for some reason we didn't get a few deps
+    if [ $(cat "${BUILD_DIR}/lib/deps.txt" | wc -l ) -lt 8 ] ; then
+        cat "${BUILD_DIR}/lib/deps.txt"
+        echo "ERROR: deps file short"
+        exit 1
+    fi
     compress_libs
 fi
152 llm/ggla.go (new file)

@@ -0,0 +1,152 @@
package llm

import (
	"encoding/binary"
	"errors"
	"io"
	"slices"
)

type ContainerGGLA struct {
	version uint32
}

func (c *ContainerGGLA) Name() string {
	return "ggla"
}

func (c *ContainerGGLA) Decode(rso *readSeekOffset) (model, error) {
	binary.Read(rso, binary.LittleEndian, &c.version)

	switch c.version {
	case 1:
	default:
		return nil, errors.New("invalid version")
	}

	model := newModelGGLA(c)
	err := model.decode(rso)
	return model, err
}

type ModelGGLA struct {
	*ContainerGGLA

	kv      KV
	tensors []Tensor
}

func newModelGGLA(container *ContainerGGLA) *ModelGGLA {
	return &ModelGGLA{
		ContainerGGLA: container,
		kv:            make(KV),
	}
}

func (m *ModelGGLA) decode(rso *readSeekOffset) error {
	var r uint32
	if err := binary.Read(rso, binary.LittleEndian, &r); err != nil {
		return err
	}
	m.kv["r"] = r

	var alpha uint32
	if err := binary.Read(rso, binary.LittleEndian, &alpha); err != nil {
		return err
	}
	m.kv["alpha"] = alpha

	for {
		var dims uint32
		if err := binary.Read(rso, binary.LittleEndian, &dims); err != nil {
			return err
		}

		var namesize uint32
		if err := binary.Read(rso, binary.LittleEndian, &namesize); err != nil {
			return err
		}

		var t Tensor
		if err := binary.Read(rso, binary.LittleEndian, &t.Kind); err != nil {
			return err
		}

		t.Shape = make([]uint64, dims)
		for i := 0; uint32(i) < dims; i++ {
			var shape32 uint32
			if err := binary.Read(rso, binary.LittleEndian, &shape32); err != nil {
				return err
			}

			t.Shape[i] = uint64(shape32)
		}

		// ggla tensor shape is reversed
		// ref: https://github.com/ggerganov/llama.cpp/blob/29ae62d2ae163e2b68aa0ad3bf2ab4636de0c957/convert-lora-to-ggml.py#L44
		slices.Reverse(t.Shape)

		name := make([]byte, namesize)
		if err := binary.Read(rso, binary.LittleEndian, &name); err != nil {
			return err
		}

		t.Name = string(name)

		if _, err := rso.Seek((rso.offset+31)&-32, io.SeekStart); err != nil {
			return err
		}

		t.Offset = uint64(rso.offset)

		if _, err := rso.Seek(int64(t.Size()), io.SeekCurrent); err != nil {
			return err
		}

		m.tensors = append(m.tensors, t)
	}
}

func (m *ModelGGLA) KV() KV {
	return m.kv
}

func (m *ModelGGLA) Tensor() []Tensor {
	return m.tensors
}

func (*ModelGGLA) ModelFamily() string {
	return "ggla"
}

func (*ModelGGLA) ModelType() string {
	panic("not implemented")
}

func (*ModelGGLA) FileType() string {
	panic("not implemented")
}

func (*ModelGGLA) NumLayers() uint32 {
	panic("not implemented")
}

func (*ModelGGLA) NumGQA() uint32 {
	panic("not implemented")
}

func (*ModelGGLA) NumEmbed() uint32 {
	panic("not implemented")
}

func (*ModelGGLA) NumHead() uint32 {
	panic("not implemented")
}

func (*ModelGGLA) NumHeadKv() uint32 {
	panic("not implemented")
}

func (*ModelGGLA) NumCtx() uint32 {
	panic("not implemented")
}
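A hedged usage sketch for the new container: `DecodeGGML` (shown in the next file's hunks) dispatches on the file magic, so a ggla adapter file should flow through `ContainerGGLA.Decode` above. The file path and the `ModelFamily` accessor on the decoded result are assumptions:

```go
package main

import (
	"fmt"
	"os"

	"github.com/jmorganca/ollama/llm"
)

func main() {
	f, err := os.Open("adapter.ggla") // hypothetical LoRA adapter path
	if err != nil {
		panic(err)
	}
	defer f.Close()

	// FILE_MAGIC_GGLA routes this to ContainerGGLA.Decode, which now
	// actually parses r, alpha, and the tensor table instead of seeking
	// to EOF as the old containerLORA did.
	ggml, err := llm.DecodeGGML(f)
	if err != nil {
		panic(err)
	}
	fmt.Println(ggml.ModelFamily()) // expected: "ggla"
}
```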
32 llm/ggml.go

@@ -106,32 +106,6 @@ type container interface {
 	Decode(*readSeekOffset) (model, error)
 }

-type containerLORA struct {
-	version uint32
-}
-
-func (c *containerLORA) Name() string {
-	return "ggla"
-}
-
-func (c *containerLORA) Decode(rso *readSeekOffset) (model, error) {
-	var version uint32
-	binary.Read(rso, binary.LittleEndian, &version)
-
-	switch version {
-	case 1:
-	default:
-		return nil, errors.New("invalid version")
-	}
-
-	c.version = version
-
-	// remaining file contents aren't decoded
-	rso.Seek(0, io.SeekEnd)
-
-	return nil, nil
-}
-
 const (
 	// Magic constant for `ggml` files (unversioned).
 	FILE_MAGIC_GGML = 0x67676d6c

@@ -161,7 +135,7 @@ func DecodeGGML(r io.ReadSeeker) (*GGML, error) {
 	case FILE_MAGIC_GGML, FILE_MAGIC_GGMF, FILE_MAGIC_GGJT:
 		return nil, ErrUnsupportedFormat
 	case FILE_MAGIC_GGLA:
-		c = &containerLORA{}
+		c = &ContainerGGLA{}
 	case FILE_MAGIC_GGUF_LE:
 		c = &ContainerGGUF{ByteOrder: binary.LittleEndian}
 	case FILE_MAGIC_GGUF_BE:

@@ -171,7 +145,9 @@ func DecodeGGML(r io.ReadSeeker) (*GGML, error) {
 	}

 	model, err := c.Decode(&ro)
-	if err != nil {
+	if errors.Is(err, io.EOF) {
+		// noop
+	} else if err != nil {
 		return nil, err
 	}
10 llm/gguf.go

@@ -94,7 +94,7 @@ type Tensor struct {
 	Offset uint64

 	// shape is the number of elements in each dimension
-	Shape [4]uint64
+	Shape []uint64

 	FileName      string
 	OffsetPadding uint64

@@ -156,7 +156,11 @@ func (t Tensor) TypeSize() uint64 {
 }

 func (t Tensor) Parameters() uint64 {
-	return t.Shape[0] * t.Shape[1] * t.Shape[2] * t.Shape[3]
+	var count uint64 = 1
+	for _, n := range t.Shape {
+		count *= n
+	}
+	return count
 }

 func (t Tensor) Size() uint64 {

@@ -703,7 +707,7 @@ func (llm *GGUFModel) Decode(rso *readSeekOffset) error {
 			Name:   name,
 			Kind:   llm.readU32(rso),
 			Offset: llm.readU64(rso),
-			Shape:  shape,
+			Shape:  shape[:],
 		}

 		llm.Tensors = append(llm.Tensors, tensor)
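The switch from `Shape [4]uint64` to `Shape []uint64` makes `Parameters` rank-agnostic; a quick standalone check of the new logic (pared-down stand-in type):

```go
package main

import "fmt"

type tensor struct{ shape []uint64 }

// parameters reproduces the new loop: the product over however many
// dimensions the tensor actually has, starting from 1.
func (t tensor) parameters() uint64 {
	var count uint64 = 1
	for _, n := range t.shape {
		count *= n
	}
	return count
}

func main() {
	fmt.Println(tensor{shape: []uint64{4096, 4096}}.parameters()) // 16777216
	fmt.Println(tensor{shape: []uint64{32000}}.parameters())      // 32000
	// The old fixed-size version multiplied all four slots, so it only
	// worked when unused dimensions were padded appropriately.
}
```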
Submodule llm/llama.cpp updated: 6cdabe6526...ceca1aef07
32 llm/llm.go

@@ -6,6 +6,7 @@ import (
 	"log/slog"
 	"os"
 	"runtime"
+	"slices"

 	"github.com/jmorganca/ollama/api"
 	"github.com/jmorganca/ollama/gpu"

@@ -19,6 +20,10 @@ type LLM interface {
 	Close()
 }

+var cpuOnlyFamilies = []string{
+	"mamba",
+}
+
 func New(model string, adapters, projectors []string, opts api.Options) (LLM, error) {
 	if _, err := os.Stat(model); err != nil {
 		return nil, err

@@ -48,13 +53,18 @@ func New(model string, adapters, projectors []string, opts api.Options) (LLM, er
 	size := ggml.Size

 	// fp16 k,v matrices require = n_ctx * n_layer * n_embd / n_head * n_head_kv * 2 bytes each * 2 key and value
-	kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(ggml.NumHead())
+	kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(max(ggml.NumHead(), 1))

 	// this amount is the overhead + tensors in memory
 	// TODO: get this from the llama.cpp's graph calculations instead of
 	// estimating it's 1/6 * kv_cache_size * num_gqa
 	graph := int64(ggml.NumGQA()) * kv / 6

+	// certain model architectures don't support gpu inference yet
+	if slices.Contains(cpuOnlyFamilies, ggml.ModelFamily()) {
+		opts.NumGPU = 0
+	}
+
 	info := gpu.GetGPUInfo()
 	switch runtime.GOOS {
 	case "darwin":

@@ -63,9 +73,7 @@ func New(model string, adapters, projectors []string, opts api.Options) (LLM, er
 		}

 		if size+kv+graph > vram {
-			slog.Info("not enough vram available, falling back to CPU only")
-			info.Library = "cpu"
-			info.Variant = gpu.GetCPUVariant()
+			slog.Info("not enough vram available, setting num_gpu=0")
+			opts.NumGPU = 0
 			break
 		}

@@ -143,15 +151,25 @@ func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []stri
 		}
 	}

-	err := fmt.Errorf("unable to locate suitable llm library")
+	// We stage into a temp directory, and if we've been idle for a while, it may have been reaped
+	_, err := os.Stat(dynLibs[0])
+	if err != nil {
+		slog.Info(fmt.Sprintf("%s has disappeared, reloading libraries", dynLibs[0]))
+		err = nativeInit()
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	err2 := fmt.Errorf("unable to locate suitable llm library")
 	for _, dynLib := range dynLibs {
 		srv, err := newDynExtServer(dynLib, model, adapters, projectors, opts)
 		if err == nil {
 			return srv, nil
 		}
 		slog.Warn(fmt.Sprintf("Failed to load dynamic library %s %s", dynLib, err))
-		err = err
+		err2 = err
 	}

-	return nil, err
+	return nil, err2
 }
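The fp16 KV-cache estimate in the hunk above, written out as a standalone function (the illustrative model dimensions are assumptions, not from the diff):

```go
package main

import "fmt"

// kvCacheBytes: 2 bytes per fp16 element, times 2 for key and value,
// per context position, per layer, with the embedding width scaled by
// the GQA ratio headKV/head. max(..., 1) guards the divide-by-zero the
// diff fixes for models that report zero heads.
func kvCacheBytes(numCtx, numLayers, numEmbed, numHeadKV, numHead int64) int64 {
	return 2 * 2 * numCtx * numLayers * numEmbed * numHeadKV / max(numHead, 1)
}

func main() {
	// e.g. a 7B-class model: 4096 ctx, 32 layers, 4096 embed, 32 heads, no GQA
	kv := kvCacheBytes(4096, 32, 4096, 32, 32)
	fmt.Printf("%d bytes (~%d MiB)\n", kv, kv/1024/1024) // 2147483648 bytes (~2048 MiB)
}
```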
@@ -1,19 +1,21 @@
 diff --git a/examples/server/server.cpp b/examples/server/server.cpp
-index f255ad76..914ecfdd 100644
+index 8fe5e0b1..3e82acb9 100644
 --- a/examples/server/server.cpp
 +++ b/examples/server/server.cpp
-@@ -1101,12 +1101,13 @@ struct server_context {
+@@ -997,13 +997,15 @@ struct llama_server_context
          slot.n_sent_text += result.text_to_send.size();
          // add the token to slot queue and cache
      }
 
 -    slot.add_token_string(result);
-     if (slot.params.stream) {
+
+     if (slot.params.stream)
+     {
          send_partial_response(slot, result);
      }
 
 +    slot.add_token_string(result);
 +
-     if (incomplete) {
+     if (incomplete)
+     {
          slot.has_next_token = true;
      }
@@ -1,9 +1,9 @@
 diff --git a/examples/server/server.cpp b/examples/server/server.cpp
-index f255ad76..5b83acb1 100644
+index 8fe5e0b1..53bf39c1 100644
 --- a/examples/server/server.cpp
 +++ b/examples/server/server.cpp
-@@ -28,6 +28,10 @@
- #include <thread>
+@@ -31,6 +31,10 @@
+ #include <atomic>
  #include <signal.h>
 
 +#ifdef GGML_USE_CUBLAS
@@ -12,8 +12,8 @@ index f255ad76..5b83acb1 100644
 +
  using json = nlohmann::json;
 
  bool server_verbose = false;
-@@ -648,6 +652,10 @@ struct server_context {
- struct server_params {
+@@ -363,6 +367,10 @@ struct llama_server_context
+     llama_free_model(model);
+     model = nullptr;
+ }
@@ -23,8 +23,8 @@ index f255ad76..5b83acb1 100644
 +#endif
  }
 
- bool load_model(const gpt_params & params_) {
-@@ -3339,6 +3347,7 @@ int main(int argc, char ** argv) {
+ bool load_model(const gpt_params &params_)
+@@ -3543,6 +3551,7 @@ int main(int argc, char **argv)
  sigemptyset (&sigint_action.sa_mask);
  sigint_action.sa_flags = 0;
  sigaction(SIGINT, &sigint_action, NULL);
@@ -33,7 +33,7 @@ index f255ad76..5b83acb1 100644
  auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
      return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
 diff --git a/ggml-cuda.cu b/ggml-cuda.cu
-index 72bcec8c..50a45e3d 100644
+index 72bcec8c..6c934e8c 100644
 --- a/ggml-cuda.cu
 +++ b/ggml-cuda.cu
 @@ -43,6 +43,7 @@
@@ -76,11 +76,12 @@ index 72bcec8c..50a45e3d 100644
      g_cublas_loaded = true;
    }
  }
-@@ -12490,3 +12491,22 @@ GGML_CALL int ggml_backend_cuda_reg_devices() {
+@@ -12490,3 +12491,23 @@ GGML_CALL int ggml_backend_cuda_reg_devices() {
    }
    return device_count;
  }
 +
++
 +extern "C" GGML_CALL void ggml_free_cublas(void);
 +GGML_CALL void ggml_free_cublas(void) {
 +  for (int id = 0; id < g_device_count; ++id) {
@@ -99,6 +100,7 @@ index 72bcec8c..50a45e3d 100644
 +
 +  g_cublas_initialized = false;
 +}
+\ No newline at end of file
 diff --git a/ggml-cuda.h b/ggml-cuda.h
 index b1ebd61d..6dd58ddf 100644
 --- a/ggml-cuda.h
44 llm/patches/03-load_exception.diff (new file)

@@ -0,0 +1,44 @@
diff --git a/llama.cpp b/llama.cpp
index 4225f955..7b762f86 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4756,7 +4756,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
         }
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
-        return -1;
+        throw;
     }

     return 0;
@@ -12102,16 +12102,22 @@ struct llama_model * llama_load_model_from_file(
         };
     }

-    int status = llama_model_load(path_model, *model, params);
-    GGML_ASSERT(status <= 0);
-    if (status < 0) {
-        if (status == -1) {
-            LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
-        } else if (status == -2) {
-            LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
+    try {
+        int status = llama_model_load(path_model, *model, params);
+        GGML_ASSERT(status <= 0);
+        if (status < 0) {
+            if (status == -1) {
+                LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
+            } else if (status == -2) {
+                LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
+            }
+            delete model;
+            return nullptr;
         }
+    } catch (...) {
+        LLAMA_LOG_ERROR("%s: exception loading model\n", __func__);
         delete model;
-        return nullptr;
+        throw;
     }

     return model;
13 llm/patches/04-locale.diff (new file)

@@ -0,0 +1,13 @@
diff --git a/llama.cpp b/llama.cpp
index b27aa272..99372f9c 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -9360,7 +9360,7 @@ struct llm_tokenizer_wpm {
     }

     uint32_t to_lower(uint32_t code) {
-        static const std::locale locale("en_US.UTF-8");
+        static const std::locale locale("");
 #if defined(_WIN32)
         if (code > 0xFFFF) {
             return code;
@@ -104,31 +104,14 @@ func rocmDynLibPresent() bool {
 }

 func nativeInit() error {
-	slog.Info("Extracting dynamic libraries...")
-	assetsDir, err := gpu.AssetsDir()
+	payloadsDir, err := gpu.PayloadsDir()
 	if err != nil {
 		return err
 	}
-
-	// delete the assetsDir
-	if err := os.RemoveAll(assetsDir); err != nil {
-		return err
-	}
-
-	if runtime.GOOS == "darwin" {
-		err := extractPayloadFiles(assetsDir, "llama.cpp/ggml-metal.metal")
-		if err != nil {
-			if err == payloadMissing {
-				// TODO perhaps consider this a hard failure on arm macs?
-				slog.Info("ggml-meta.metal payload missing")
-				return nil
-			}
-			return err
-		}
-		os.Setenv("GGML_METAL_PATH_RESOURCES", assetsDir)
-	}
-
-	libs, err := extractDynamicLibs(assetsDir, "llama.cpp/build/*/*/*/lib/*")
+	slog.Info(fmt.Sprintf("Extracting dynamic libraries to %s ...", payloadsDir))
+
+	libs, err := extractDynamicLibs(payloadsDir, "llama.cpp/build/*/*/*/lib/*")
 	if err != nil {
 		if err == payloadMissing {
 			slog.Info(fmt.Sprintf("%s", payloadMissing))

@@ -159,7 +142,7 @@ func nativeInit() error {
 	return nil
 }

-func extractDynamicLibs(assetsDir, glob string) ([]string, error) {
+func extractDynamicLibs(payloadsDir, glob string) ([]string, error) {
 	files, err := fs.Glob(libEmbed, glob)
 	if err != nil || len(files) == 0 {
 		return nil, payloadMissing

@@ -178,14 +161,14 @@ func extractDynamicLibs(assetsDir, glob string) ([]string, error) {
 		g.Go(func() error {
 			// llama.cpp/build/$OS/$GOARCH/$VARIANT/lib/$LIBRARY
 			// Include the variant in the path to avoid conflicts between multiple server libs
-			targetDir := filepath.Join(assetsDir, pathComps[pathComponentCount-3])
+			targetDir := filepath.Join(payloadsDir, pathComps[pathComponentCount-3])
 			srcFile, err := libEmbed.Open(file)
 			if err != nil {
 				return fmt.Errorf("read payload %s: %v", file, err)
 			}
 			defer srcFile.Close()
 			if err := os.MkdirAll(targetDir, 0o755); err != nil {
-				return fmt.Errorf("create payload lib dir %s: %v", assetsDir, err)
+				return fmt.Errorf("create payload lib dir %s: %v", payloadsDir, err)
 			}
 			src := io.Reader(srcFile)
 			filename := file

@@ -216,52 +199,6 @@ func extractDynamicLibs(assetsDir, glob string) ([]string, error) {
 	return libs, g.Wait()
 }

-func extractPayloadFiles(assetsDir, glob string) error {
-	files, err := fs.Glob(libEmbed, glob)
-	if err != nil || len(files) == 0 {
-		return payloadMissing
-	}
-
-	for _, file := range files {
-		srcFile, err := libEmbed.Open(file)
-		if err != nil {
-			return fmt.Errorf("read payload %s: %v", file, err)
-		}
-		defer srcFile.Close()
-		if err := os.MkdirAll(assetsDir, 0o755); err != nil {
-			return fmt.Errorf("create payload lib dir %s: %v", assetsDir, err)
-		}
-		src := io.Reader(srcFile)
-		filename := file
-		if strings.HasSuffix(file, ".gz") {
-			src, err = gzip.NewReader(src)
-			if err != nil {
-				return fmt.Errorf("decompress payload %s: %v", file, err)
-			}
-			filename = strings.TrimSuffix(filename, ".gz")
-		}
-
-		destFile := filepath.Join(assetsDir, filepath.Base(filename))
-		_, err = os.Stat(destFile)
-		switch {
-		case errors.Is(err, os.ErrNotExist):
-			destFp, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
-			if err != nil {
-				return fmt.Errorf("write payload %s: %v", file, err)
-			}
-			defer destFp.Close()
-			if _, err := io.Copy(destFp, src); err != nil {
-				return fmt.Errorf("copy payload %s: %v", file, err)
-			}
-		case err != nil:
-			return fmt.Errorf("stat payload %s: %v", file, err)
-		case err == nil:
-			slog.Debug("payload already exists: " + destFile)
-		}
-	}
-	return nil
-}
-
 func verifyDriverAccess() error {
 	if runtime.GOOS != "linux" {
 		return nil
@@ -4,5 +4,5 @@ import (
 	"embed"
 )

-//go:embed llama.cpp/ggml-metal.metal llama.cpp/build/darwin/x86_64/*/lib/*.dylib*
+//go:embed llama.cpp/build/darwin/x86_64/*/lib/*.dylib*
 var libEmbed embed.FS
@@ -19,10 +19,9 @@ type Buffer struct {

 func NewBuffer(prompt *Prompt) (*Buffer, error) {
 	fd := int(os.Stdout.Fd())
-	width, height, err := term.GetSize(fd)
-	if err != nil {
-		fmt.Println("Error getting size:", err)
-		return nil, err
+	width, height := 80, 24
+	if termWidth, termHeight, err := term.GetSize(fd); err == nil {
+		width, height = termWidth, termHeight
 	}

 	lwidth := width - len(prompt.prompt())
@@ -10,8 +10,8 @@ mkdir -p dist
 for TARGETARCH in arm64 amd64; do
     rm -rf llm/llama.cpp/build
     GOOS=darwin GOARCH=$TARGETARCH go generate ./...
-    CGO_ENABLED=1 GOOS=darwin GOARCH=$TARGETARCH go build -o dist/ollama-darwin-$TARGETARCH
-    CGO_ENABLED=1 GOOS=darwin GOARCH=$TARGETARCH go build -cover -o dist/ollama-darwin-$TARGETARCH-cov
+    CGO_ENABLED=1 GOOS=darwin GOARCH=$TARGETARCH go build -trimpath -o dist/ollama-darwin-$TARGETARCH
+    CGO_ENABLED=1 GOOS=darwin GOARCH=$TARGETARCH go build -trimpath -cover -o dist/ollama-darwin-$TARGETARCH-cov
 done

 lipo -create -output dist/ollama dist/ollama-darwin-arm64 dist/ollama-darwin-amd64
@@ -22,6 +22,10 @@ for TARGETARCH in ${BUILD_ARCH}; do
         .
     docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH
     docker cp builder-$TARGETARCH:/go/src/github.com/jmorganca/ollama/ollama ./dist/ollama-linux-$TARGETARCH
-    docker cp builder-$TARGETARCH:/go/src/github.com/jmorganca/ollama/dist/deps/ ./dist/
+
+    if [ "$TARGETARCH" = "amd64" ]; then
+        docker cp builder-$TARGETARCH:/go/src/github.com/jmorganca/ollama/dist/deps/ ./dist/
+    fi
+
     docker rm builder-$TARGETARCH
 done
@@ -53,7 +53,7 @@ function buildOllama() {
     write-host "Building ollama CLI"
     & go generate ./...
     if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-    & go build -ldflags "-s -w -X=github.com/jmorganca/ollama/version.Version=$script:VERSION -X=github.com/jmorganca/ollama/server.mode=release" .
+    & go build -trimpath -ldflags "-s -w -X=github.com/jmorganca/ollama/version.Version=$script:VERSION -X=github.com/jmorganca/ollama/server.mode=release" .
     if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
     if ("${env:KEY_CONTAINER}") {
         & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
@@ -68,7 +68,7 @@ function buildApp() {
     write-host "Building Ollama App"
     cd "${script:SRC_DIR}\app"
     & windres -l 0 -o ollama.syso ollama.rc
-    & go build -ldflags "-s -w -H windowsgui -X=github.com/jmorganca/ollama/version.Version=$script:VERSION -X=github.com/jmorganca/ollama/server.mode=release" .
+    & go build -trimpath -ldflags "-s -w -H windowsgui -X=github.com/jmorganca/ollama/version.Version=$script:VERSION -X=github.com/jmorganca/ollama/server.mode=release" .
     if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
     if ("${env:KEY_CONTAINER}") {
         & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
@@ -10,6 +10,7 @@ import (
 	"errors"
 	"fmt"
 	"io"
+	"io/fs"
 	"log"
 	"log/slog"
 	"net/http"
@@ -322,9 +323,12 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars
 
 		ggufName, err := convertSafetensors(name, pathName)
 		if err != nil {
+			var pathErr *fs.PathError
 			switch {
 			case errors.Is(err, zip.ErrFormat):
 				// it's not a safetensor archive
+			case errors.As(err, &pathErr):
+				// it's not a file on disk, could be a model reference
 			default:
 				return err
 			}
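The switch leans on the two standard error-matching helpers: errors.Is for the zip.ErrFormat sentinel (the path was readable but is not a valid archive) and errors.As for the *fs.PathError type (the argument was never a file on disk, so it may be a model reference instead). A runnable sketch of the same classification; classify and the sample paths are illustrative:

package main

import (
	"archive/zip"
	"errors"
	"fmt"
	"io/fs"
	"os"
)

func classify(err error) string {
	var pathErr *fs.PathError
	switch {
	case errors.Is(err, zip.ErrFormat):
		// sentinel comparison follows the wrap chain
		return "readable, but not a valid archive"
	case errors.As(err, &pathErr):
		// typed match extracts the failing path
		return "no such file: " + pathErr.Path
	default:
		return "unexpected: " + err.Error()
	}
}

func main() {
	_, err := os.Open("/no/such/model.safetensors")
	fmt.Println(classify(err)) // no such file: /no/such/model.safetensors
	fmt.Println(classify(fmt.Errorf("open archive: %w", zip.ErrFormat)))
}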
@@ -469,7 +473,13 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars
 			}
 			defer bin.Close()
 
-			layer, err := NewLayer(bin, mediatype)
+			ggml, err := llm.DecodeGGML(bin)
+			if err != nil {
+				return err
+			}
+
+			sr := io.NewSectionReader(bin, 0, ggml.Size)
+			layer, err := NewLayer(sr, mediatype)
 			if err != nil {
 				return err
 			}
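Rather than handing NewLayer the whole open file, the new code first decodes the GGML header to learn the model's exact byte size, then wraps the file in an io.SectionReader over just that range, so trailing bytes never leak into the layer. How a SectionReader bounds a ReaderAt, shown in isolation with stand-in data:

package main

import (
	"fmt"
	"io"
	"strings"
)

func main() {
	// io.NewSectionReader(r, off, n) exposes only bytes [off, off+n) of the
	// underlying ReaderAt, no matter how much data follows.
	underlying := strings.NewReader("GGUFdata|trailing junk that must not be hashed")
	sr := io.NewSectionReader(underlying, 0, 8)

	got, _ := io.ReadAll(sr)
	fmt.Printf("%q\n", got) // "GGUFdata"
}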
@@ -785,10 +795,8 @@ func PruneLayers() error {
 
 	for _, blob := range blobs {
 		name := blob.Name()
-		if runtime.GOOS == "windows" {
-			name = strings.ReplaceAll(name, "-", ":")
-		}
-		if strings.HasPrefix(name, "sha256:") {
+		name = strings.ReplaceAll(name, "-", ":")
+		if strings.HasPrefix(name, "sha256-") {
 			deleteMap[name] = struct{}{}
 		}
 	}
@@ -5,7 +5,6 @@ import (
 	"fmt"
 	"io"
 	"os"
-	"runtime"
 	"strings"
 
 	"golang.org/x/exp/slices"
@@ -47,10 +46,7 @@ func NewLayer(r io.Reader, mediatype string) (*Layer, error) {
 		return nil, err
 	}
 
-	delimiter := ":"
-	if runtime.GOOS == "windows" {
-		delimiter = "-"
-	}
+	const delimiter = "-"
 
 	pattern := strings.Join([]string{"sha256", "*-partial"}, delimiter)
 	temp, err := os.CreateTemp(blobs, pattern)
@@ -6,7 +6,6 @@ import (
 	"net/url"
 	"os"
 	"path/filepath"
-	"runtime"
 	"strings"
 )
@@ -150,10 +149,7 @@ func GetBlobsPath(digest string) (string, error) {
 		return "", err
 	}
 
-	if runtime.GOOS == "windows" {
-		digest = strings.ReplaceAll(digest, ":", "-")
-	}
-
+	digest = strings.ReplaceAll(digest, ":", "-")
 	path := filepath.Join(dir, "blobs", digest)
 	dirPath := filepath.Dir(path)
 	if digest == "" {
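With the platform check gone, a digest such as sha256:abc123 now maps to the on-disk file sha256-abc123 on every OS; previously only Windows, where ':' is illegal in file names, used the dash form. The mapping in miniature (blobPath and the sample directory are this sketch's stand-ins, not the repository's API):

package main

import (
	"fmt"
	"path/filepath"
	"strings"
)

func blobPath(dir, digest string) string {
	// "sha256:abc123" -> "<dir>/blobs/sha256-abc123" on every platform.
	return filepath.Join(dir, "blobs", strings.ReplaceAll(digest, ":", "-"))
}

func main() {
	fmt.Println(blobPath("/home/user/.ollama/models", "sha256:abc123"))
}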
server/routes.go
@@ -8,13 +8,16 @@ import (
 	"io"
 	"io/fs"
 	"log/slog"
+	"math"
 	"net"
 	"net/http"
+	"net/netip"
 	"os"
 	"os/signal"
 	"path/filepath"
 	"reflect"
 	"runtime"
+	"strconv"
 	"strings"
 	"sync"
 	"syscall"
@@ -35,7 +38,7 @@ import (
 var mode string = gin.DebugMode
 
 type Server struct {
-	WorkDir string
+	addr net.Addr
 }
 
 func init() {
@@ -206,7 +209,7 @@ func GenerateHandler(c *gin.Context) {
 
 	var sessionDuration time.Duration
 	if req.KeepAlive == nil {
-		sessionDuration = defaultSessionDuration
+		sessionDuration = getDefaultSessionDuration()
 	} else {
 		sessionDuration = req.KeepAlive.Duration
 	}
@@ -383,6 +386,32 @@ func GenerateHandler(c *gin.Context) {
 	streamResponse(c, ch)
 }
 
+func getDefaultSessionDuration() time.Duration {
+	if t, exists := os.LookupEnv("OLLAMA_KEEP_ALIVE"); exists {
+		v, err := strconv.Atoi(t)
+		if err != nil {
+			d, err := time.ParseDuration(t)
+			if err != nil {
+				return defaultSessionDuration
+			}
+
+			if d < 0 {
+				return time.Duration(math.MaxInt64)
+			}
+
+			return d
+		}
+
+		d := time.Duration(v) * time.Second
+		if d < 0 {
+			return time.Duration(math.MaxInt64)
+		}
+		return d
+	}
+
+	return defaultSessionDuration
+}
+
 func EmbeddingsHandler(c *gin.Context) {
 	loaded.mu.Lock()
 	defer loaded.mu.Unlock()
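getDefaultSessionDuration accepts OLLAMA_KEEP_ALIVE as either a bare integer (interpreted as seconds) or a Go duration string, and treats any negative value as "keep the model loaded indefinitely". A standalone sketch of the same rules, with parseKeepAlive and a hard-coded 5m fallback standing in for the server's function and its defaultSessionDuration:

package main

import (
	"fmt"
	"math"
	"strconv"
	"time"
)

// parseKeepAlive mirrors the rules above: "42" -> 42s, "42m" -> 42m,
// any negative value -> effectively forever, anything unparseable ->
// the default.
func parseKeepAlive(t string) time.Duration {
	if v, err := strconv.Atoi(t); err == nil {
		if d := time.Duration(v) * time.Second; d >= 0 {
			return d
		}
		return time.Duration(math.MaxInt64)
	}
	if d, err := time.ParseDuration(t); err == nil {
		if d >= 0 {
			return d
		}
		return time.Duration(math.MaxInt64)
	}
	return 5 * time.Minute // stand-in for defaultSessionDuration
}

func main() {
	for _, s := range []string{"42", "42m", "-1", "bogus"} {
		fmt.Printf("%-6s -> %s\n", s, parseKeepAlive(s))
	}
}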
@@ -426,7 +455,7 @@ func EmbeddingsHandler(c *gin.Context) {
 
 	var sessionDuration time.Duration
 	if req.KeepAlive == nil {
-		sessionDuration = defaultSessionDuration
+		sessionDuration = getDefaultSessionDuration()
 	} else {
 		sessionDuration = req.KeepAlive.Duration
 	}
@@ -904,15 +933,83 @@ var defaultAllowOrigins = []string{
 	"0.0.0.0",
 }
 
-func NewServer() (*Server, error) {
-	workDir, err := os.MkdirTemp("", "ollama")
-	if err != nil {
-		return nil, err
-	}
-
-	return &Server{
-		WorkDir: workDir,
-	}, nil
-}
+func isLocalIP(ip netip.Addr) bool {
+	if interfaces, err := net.Interfaces(); err == nil {
+		for _, iface := range interfaces {
+			addrs, err := iface.Addrs()
+			if err != nil {
+				continue
+			}
+
+			for _, a := range addrs {
+				if parsed, _, err := net.ParseCIDR(a.String()); err == nil {
+					if parsed.String() == ip.String() {
+						return true
+					}
+				}
+			}
+		}
+	}
+
+	return false
+}
+
+func allowedHost(host string) bool {
+	if host == "" || host == "localhost" {
+		return true
+	}
+
+	if hostname, err := os.Hostname(); err == nil && host == hostname {
+		return true
+	}
+
+	var tlds = []string{
+		"localhost",
+		"local",
+		"internal",
+	}
+
+	// check if the host is a local TLD
+	for _, tld := range tlds {
+		if strings.HasSuffix(host, "."+tld) {
+			return true
+		}
+	}
+
+	return false
+}
+
+func allowedHostsMiddleware(addr net.Addr) gin.HandlerFunc {
+	return func(c *gin.Context) {
+		if addr == nil {
+			c.Next()
+			return
+		}
+
+		if addr, err := netip.ParseAddrPort(addr.String()); err == nil && !addr.Addr().IsLoopback() {
+			c.Next()
+			return
+		}
+
+		host, _, err := net.SplitHostPort(c.Request.Host)
+		if err != nil {
+			host = c.Request.Host
+		}
+
+		if addr, err := netip.ParseAddr(host); err == nil {
+			if addr.IsLoopback() || addr.IsPrivate() || addr.IsUnspecified() || isLocalIP(addr) {
+				c.Next()
+				return
+			}
+		}
+
+		if allowedHost(host) {
+			c.Next()
+			return
+		}
+
+		c.AbortWithStatus(http.StatusForbidden)
+	}
+}
+
 func (s *Server) GenerateRoutes() http.Handler {
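Taken together: when the server is bound to a loopback address, the middleware only serves requests whose Host header still looks local - a loopback/private/unspecified IP, an address on a local interface, the machine's hostname, or a name under .localhost/.local/.internal - and answers anything else with 403. That refuses DNS-rebinding-style requests that reach the local API under a public hostname. A reduced, runnable sketch of the Host test; localHost is this sketch's name, and the interface-enumeration and hostname checks are deliberately omitted:

package main

import (
	"fmt"
	"net/netip"
	"strings"
)

// localHost mirrors the middleware's Host-header decision in isolation:
// literal local IPs and local-sounding names pass, public names do not.
func localHost(host string) bool {
	if addr, err := netip.ParseAddr(host); err == nil {
		return addr.IsLoopback() || addr.IsPrivate() || addr.IsUnspecified()
	}
	if host == "" || host == "localhost" {
		return true
	}
	for _, tld := range []string{"localhost", "local", "internal"} {
		if strings.HasSuffix(host, "."+tld) {
			return true
		}
	}
	return false
}

func main() {
	for _, h := range []string{"127.0.0.1", "localhost", "ollama.internal", "evil.example.com"} {
		fmt.Printf("%-18s allowed=%v\n", h, localHost(h))
	}
}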
@@ -938,10 +1035,7 @@ func (s *Server) GenerateRoutes() http.Handler {
 	r := gin.Default()
 	r.Use(
 		cors.New(config),
-		func(c *gin.Context) {
-			c.Set("workDir", s.WorkDir)
-			c.Next()
-		},
+		allowedHostsMiddleware(s.addr),
 	)
 
 	r.POST("/api/pull", PullModelHandler)
@@ -1010,10 +1104,7 @@ func Serve(ln net.Listener) error {
 		}
 	}
 
-	s, err := NewServer()
-	if err != nil {
-		return err
-	}
+	s := &Server{addr: ln.Addr()}
 	r := s.GenerateRoutes()
 
 	slog.Info(fmt.Sprintf("Listening on %s (version %s)", ln.Addr(), version.Version))
@@ -1029,7 +1120,7 @@
 		if loaded.runner != nil {
 			loaded.runner.Close()
 		}
-		os.RemoveAll(s.WorkDir)
 		gpu.Cleanup()
 		os.Exit(0)
 	}()
@@ -1165,7 +1256,7 @@ func ChatHandler(c *gin.Context) {
 
 	var sessionDuration time.Duration
 	if req.KeepAlive == nil {
-		sessionDuration = defaultSessionDuration
+		sessionDuration = getDefaultSessionDuration()
 	} else {
 		sessionDuration = req.KeepAlive.Duration
 	}
@@ -21,12 +21,6 @@ import (
 	"github.com/jmorganca/ollama/version"
 )
 
-func setupServer(t *testing.T) (*Server, error) {
-	t.Helper()
-
-	return NewServer()
-}
-
 func Test_Routes(t *testing.T) {
 	type testCase struct {
 		Name string
@@ -207,9 +201,7 @@ func Test_Routes(t *testing.T) {
 		},
 	}
 
-	s, err := setupServer(t)
-	assert.Nil(t, err)
-
+	s := Server{}
 	router := s.GenerateRoutes()
 
 	httpSrv := httptest.NewServer(router)
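Because Server is now usable as a zero value, route tests no longer need a constructor or temp-dir cleanup: build the handler, hand it to httptest, done. The general shape of that pattern, with a plain ServeMux and a hard-coded /api/version route standing in for Server{}.GenerateRoutes():

package main

import (
	"fmt"
	"io"
	"net/http"
	"net/http/httptest"
)

func main() {
	// httptest.NewServer binds the handler to an ephemeral local port, so
	// route tests run against a real HTTP stack; the only teardown is Close.
	mux := http.NewServeMux()
	mux.HandleFunc("/api/version", func(w http.ResponseWriter, r *http.Request) {
		fmt.Fprint(w, `{"version":"0.0.0"}`)
	})

	srv := httptest.NewServer(mux)
	defer srv.Close()

	resp, _ := http.Get(srv.URL + "/api/version")
	body, _ := io.ReadAll(resp.Body)
	resp.Body.Close()
	fmt.Println(string(body))
}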