Compare commits

...

25 Commits

Author SHA1 Message Date
Blake Mizerany
d9ea2e5c7a inital pass at replacing ":" with "-" in image on-disk names 2024-03-14 11:30:06 -07:00
Jeffrey Morgan
5ce997a7b9 Update README.md 2024-03-13 21:12:17 -07:00
Jeffrey Morgan
672ffe9b7d add OLLAMA_KEEP_ALIVE to environment variable docs for ollama serve (#3127) 2024-03-13 14:35:33 -07:00
Patrick Devine
47cfe58af5 Default Keep Alive environment variable (#3094)
---------

Co-authored-by: Chris-AS1 <8493773+Chris-AS1@users.noreply.github.com>
2024-03-13 13:29:40 -07:00
Jeffrey Morgan
e72c567cfd restore locale patch (#3091) 2024-03-12 22:08:13 -07:00
Bruce MacDonald
3e22611200 token repeat limit for prediction requests (#3080) 2024-03-12 22:08:25 -04:00
Daniel Hiltgen
a54d4a28dc Merge pull request #3088 from dhiltgen/rocm_igpu_linux
Fix iGPU detection for linux
2024-03-12 17:20:27 -07:00
Daniel Hiltgen
82b0c7c27e Fix iGPU detection for linux
This fixes a few bugs in the new sysfs discovery logic.  iGPUs are now
correctly identified by their <1G VRAM reported.  the sysfs IDs are off
by one compared to what HIP wants due to the CPU being reported
in amdgpu, but HIP only cares about GPUs.
2024-03-12 16:57:19 -07:00
Patrick Devine
ba7cf7fb66 add more docs on for the modelfile message command (#3087) 2024-03-12 16:41:41 -07:00
Bruce MacDonald
2f804068bd warn when json format is expected but not mentioned in prompt (#3081) 2024-03-12 19:07:11 -04:00
Daniel Hiltgen
34d00f90b1 Merge pull request #3070 from dhiltgen/visible_devices
Add docs explaining GPU selection env vars
2024-03-12 11:36:46 -07:00
Daniel Hiltgen
b53229a2ed Add docs explaining GPU selection env vars 2024-03-12 11:33:06 -07:00
racerole
53c107e20e chore: fix typo (#3073)
Signed-off-by: racerole <jiangyifeng@outlook.com>
2024-03-12 14:09:22 -04:00
mofanke
51578d8573 fix gpu_info_cuda.c compile warning (#3077) 2024-03-12 14:08:40 -04:00
Jeffrey Morgan
b5fcd9d3aa use -trimpath when building releases (#3069) 2024-03-11 15:58:46 -07:00
Bruce MacDonald
b80661e8c7 relay load model errors to the client (#3065) 2024-03-11 16:48:27 -04:00
Jeffrey Morgan
6d3adfbea2 Update troubleshooting.md 2024-03-11 13:22:28 -07:00
Jeffrey Morgan
369eda65f5 update llama.cpp submodule to ceca1ae (#3064) 2024-03-11 12:57:48 -07:00
Michael Yang
f878e91070 Merge pull request #3044 from ollama/mxyng/fix-convert-shape
convert: fix shape
2024-03-11 09:56:57 -07:00
Daniel Hiltgen
0d651478e4 Merge pull request #3056 from dhiltgen/rocm_link_clash
Avoid rocm runner and dependency clash
2024-03-11 09:48:48 -07:00
Michael Yang
9ea492f1ce convert: fix shape 2024-03-11 09:41:01 -07:00
Daniel Hiltgen
bc13da2bfe Avoid rocm runner and dependency clash
Putting the rocm symlink next to the runners is risky.  This moves
the payloads into a subdir to avoid potential clashes.
2024-03-11 09:33:22 -07:00
Jeffrey Morgan
41b00b9856 fix 03-locale.diff 2024-03-10 16:21:05 -07:00
Daniel Hiltgen
c2a8ed48e7 Merge pull request #3048 from dhiltgen/harden_rocm_deps
Harden for deps file being empty (or short)
2024-03-10 15:17:22 -07:00
Daniel Hiltgen
3dc1bb6a35 Harden for deps file being empty (or short) 2024-03-10 14:45:38 -07:00
27 changed files with 293 additions and 127 deletions

View File

@@ -92,7 +92,7 @@ COPY --from=rocm-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/b
COPY --from=rocm-build-amd64 /go/src/github.com/jmorganca/ollama/dist/deps/ ./dist/deps/
ARG GOFLAGS
ARG CGO_CFLAGS
RUN go build .
RUN go build -trimpath .
# Intermediate stage used for ./scripts/build_linux.sh
FROM --platform=linux/arm64 cpu-build-arm64 AS build-arm64
@@ -103,7 +103,7 @@ COPY . .
COPY --from=cuda-build-arm64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
ARG GOFLAGS
ARG CGO_CFLAGS
RUN go build .
RUN go build -trimpath .
# Runtime stages
FROM --platform=linux/amd64 ubuntu:22.04 as runtime-amd64

50
api/types_test.go Normal file
View File

@@ -0,0 +1,50 @@
package api
import (
"encoding/json"
"math"
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestKeepAliveParsingFromJSON(t *testing.T) {
tests := []struct {
name string
req string
exp *Duration
}{
{
name: "Positive Integer",
req: `{ "keep_alive": 42 }`,
exp: &Duration{42 * time.Second},
},
{
name: "Positive Integer String",
req: `{ "keep_alive": "42m" }`,
exp: &Duration{42 * time.Minute},
},
{
name: "Negative Integer",
req: `{ "keep_alive": -1 }`,
exp: &Duration{math.MaxInt64},
},
{
name: "Negative Integer String",
req: `{ "keep_alive": "-1m" }`,
exp: &Duration{math.MaxInt64},
},
}
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
var dec ChatRequest
err := json.Unmarshal([]byte(test.req), &dec)
require.NoError(t, err)
assert.Equal(t, test.exp, dec.KeepAlive)
})
}
}

View File

@@ -970,9 +970,10 @@ func NewCLI() *cobra.Command {
serveCmd.SetUsageTemplate(serveCmd.UsageTemplate() + `
Environment Variables:
OLLAMA_HOST The host:port to bind to (default "127.0.0.1:11434")
OLLAMA_ORIGINS A comma separated list of allowed origins.
OLLAMA_MODELS The path to the models directory (default is "~/.ollama/models")
OLLAMA_HOST The host:port to bind to (default "127.0.0.1:11434")
OLLAMA_ORIGINS A comma separated list of allowed origins.
OLLAMA_MODELS The path to the models directory (default is "~/.ollama/models")
OLLAMA_KEEP_ALIVE The duration that models stay loaded in memory (default is "5m")
`)
pullCmd := &cobra.Command{

View File

@@ -103,9 +103,9 @@ func ReadSafeTensors(fn string, offset uint64) ([]llm.Tensor, uint64, error) {
return []llm.Tensor{}, 0, err
}
shape := [4]uint64{1, 1, 1, 1}
for cnt, s := range data.Shape {
shape[cnt] = uint64(s)
shape := []uint64{0, 0, 0, 0}
for i := range data.Shape {
shape[i] = uint64(data.Shape[i])
}
t := llm.Tensor{

View File

@@ -3,7 +3,7 @@
### Getting Started
* [Quickstart](../README.md#quickstart)
* [Examples](../examples)
* [Importing models](./import.md) from GGUF, Pytorch and Safetensors
* [Importing models](./import.md)
* [Linux Documentation](./linux.md)
* [Windows Documentation](./windows.md)
* [Docker Documentation](https://hub.docker.com/r/ollama/ollama)

View File

@@ -193,3 +193,13 @@ To unload the model and free up memory use:
```shell
curl http://localhost:11434/api/generate -d '{"model": "llama2", "keep_alive": 0}'
```
## Controlling which GPUs to use
By default, on Linux and Windows, Ollama will attempt to use Nvidia GPUs, or
Radeon GPUs, and will use all the GPUs it can find. You can limit which GPUs
will be utilized by setting the environment variable `CUDA_VISIBLE_DEVICES` for
NVIDIA cards, or `HIP_VISIBLE_DEVICES` for Radeon GPUs to a comma delimited list
of GPU IDs. You can see the list of devices with GPU tools such as `nvidia-smi` or
`rocminfo`. You can set to an invalid GPU ID (e.g., "-1") to bypass the GPU and
fallback to CPU.

View File

@@ -131,7 +131,7 @@ The `PARAMETER` instruction defines a parameter that can be set when the model i
PARAMETER <parameter> <parametervalue>
```
### Valid Parameters and Values
#### Valid Parameters and Values
| Parameter | Description | Value Type | Example Usage |
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------- | -------------------- |
@@ -201,7 +201,22 @@ LICENSE """
### MESSAGE
The `MESSAGE` instruction allows you to specify a message history for the model to use when responding:
The `MESSAGE` instruction allows you to specify a message history for the model to use when responding. Use multiple iterations of the MESSAGE command to build up a conversation which will guide the model to answer in a similar way.
```modelfile
MESSAGE <role> <message>
```
#### Valid roles
| Role | Description |
| --------- | ------------------------------------------------------------ |
| system | Alternate way of providing the SYSTEM message for the model. |
| user | An example message of what the user could have asked. |
| assistant | An example message of how the model should respond. |
#### Example conversation
```modelfile
MESSAGE user Is Toronto in Canada?
@@ -212,6 +227,7 @@ MESSAGE user Is Ontario in Canada?
MESSAGE assistant yes
```
## Notes
- the **`Modelfile` is not case sensitive**. In the examples, uppercase instructions are used to make it easier to distinguish it from arguments.

View File

@@ -109,7 +109,3 @@ which version to install.
```sh
curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION="0.1.27" sh
```
## Known issues
* N/A

View File

@@ -40,19 +40,17 @@ func amdSetVisibleDevices(ids []int, skip map[int]interface{}) {
// TODO - does sort order matter?
devices := []string{}
for i := range ids {
slog.Debug(fmt.Sprintf("i=%d", i))
if _, skipped := skip[i]; skipped {
slog.Debug("skipped")
continue
}
devices = append(devices, strconv.Itoa(i))
}
slog.Debug(fmt.Sprintf("devices=%v", devices))
val := strings.Join(devices, ",")
err := os.Setenv("HIP_VISIBLE_DEVICES", val)
if err != nil {
slog.Warn(fmt.Sprintf("failed to set env: %s", err))
} else {
slog.Info("Setting HIP_VISIBLE_DEVICES=" + val)
}
slog.Debug("HIP_VISIBLE_DEVICES=" + val)
}

View File

@@ -24,6 +24,9 @@ const (
GPUTotalMemoryFileGlob = "mem_banks/*/properties" // size_in_bytes line
GPUUsedMemoryFileGlob = "mem_banks/*/used_memory"
RocmStandardLocation = "/opt/rocm/lib"
// TODO find a better way to detect iGPU instead of minimum memory
IGPUMemLimit = 1024 * 1024 * 1024 // 512G is what they typically report, so anything less than 1G must be iGPU
)
var (
@@ -146,8 +149,8 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
resp.memInfo.DeviceCount = 0
resp.memInfo.TotalMemory = 0
resp.memInfo.FreeMemory = 0
slog.Debug("discovering VRAM for amdgpu devices")
if len(ids) == 0 {
slog.Debug("discovering all amdgpu devices")
entries, err := os.ReadDir(AMDNodesSysfsDir)
if err != nil {
slog.Warn(fmt.Sprintf("failed to read amdgpu sysfs %s - %s", AMDNodesSysfsDir, err))
@@ -165,7 +168,7 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
ids = append(ids, id)
}
}
slog.Debug(fmt.Sprintf("discovering amdgpu devices %v", ids))
slog.Debug(fmt.Sprintf("amdgpu devices %v", ids))
for _, id := range ids {
if _, skipped := skip[id]; skipped {
@@ -173,7 +176,8 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
}
totalMemory := uint64(0)
usedMemory := uint64(0)
propGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(id), GPUTotalMemoryFileGlob)
// Adjust for sysfs vs HIP ids
propGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(id+1), GPUTotalMemoryFileGlob)
propFiles, err := filepath.Glob(propGlob)
if err != nil {
slog.Warn(fmt.Sprintf("error looking up total GPU memory: %s %s", propGlob, err))
@@ -205,6 +209,13 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
}
}
if totalMemory == 0 {
slog.Warn(fmt.Sprintf("amdgpu [%d] reports zero total memory, skipping", id))
skip[id] = struct{}{}
continue
}
if totalMemory < IGPUMemLimit {
slog.Info(fmt.Sprintf("amdgpu [%d] appears to be an iGPU with %dM reported total memory, skipping", id, totalMemory/1024/1024))
skip[id] = struct{}{}
continue
}
usedGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(id), GPUUsedMemoryFileGlob)
@@ -232,8 +243,8 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
}
usedMemory += used
}
slog.Info(fmt.Sprintf("[%d] amdgpu totalMemory %d", id, totalMemory))
slog.Info(fmt.Sprintf("[%d] amdgpu freeMemory %d", id, (totalMemory - usedMemory)))
slog.Info(fmt.Sprintf("[%d] amdgpu totalMemory %dM", id, totalMemory/1024/1024))
slog.Info(fmt.Sprintf("[%d] amdgpu freeMemory %dM", id, (totalMemory-usedMemory)/1024/1024))
resp.memInfo.DeviceCount++
resp.memInfo.TotalMemory += totalMemory
resp.memInfo.FreeMemory += (totalMemory - usedMemory)
@@ -282,7 +293,7 @@ func AMDValidateLibDir() (string, error) {
}
// If we already have a rocm dependency wired, nothing more to do
rocmTargetDir := filepath.Join(payloadsDir, "rocm")
rocmTargetDir := filepath.Clean(filepath.Join(payloadsDir, "..", "rocm"))
if rocmLibUsable(rocmTargetDir) {
return rocmTargetDir, nil
}
@@ -358,6 +369,8 @@ func AMDDriverVersion() (string, error) {
}
func AMDGFXVersions() map[int]Version {
// The amdgpu driver always exposes the host CPU as node 0, but we have to skip that and subtract one
// from the other IDs to get alignment with the HIP libraries expectations (zero is the first GPU, not the CPU)
res := map[int]Version{}
matches, _ := filepath.Glob(GPUPropertiesFileGlob)
for _, match := range matches {
@@ -373,17 +386,20 @@ func AMDGFXVersions() map[int]Version {
continue
}
if i == 0 {
// Skipping the CPU
continue
}
// Align with HIP IDs (zero is first GPU, not CPU)
i -= 1
scanner := bufio.NewScanner(fp)
for scanner.Scan() {
line := strings.TrimSpace(scanner.Text())
if strings.HasPrefix(line, "gfx_target_version") {
ver := strings.Fields(line)
if len(ver) != 2 || len(ver[1]) < 5 {
if ver[1] == "0" {
// Silently skip the CPU
continue
} else {
if ver[1] != "0" {
slog.Debug("malformed " + line)
}
res[i] = Version{

View File

@@ -23,7 +23,9 @@ func PayloadsDir() (string, error) {
if err != nil {
return "", fmt.Errorf("failed to generate tmp dir: %w", err)
}
payloadsDir = tmpDir
// We create a distinct subdirectory for payloads within the tmpdir
// This will typically look like /tmp/ollama3208993108/runners on linux
payloadsDir = filepath.Join(tmpDir, "runners")
}
return payloadsDir, nil
}
@@ -32,10 +34,12 @@ func Cleanup() {
lock.Lock()
defer lock.Unlock()
if payloadsDir != "" {
slog.Debug("cleaning up", "dir", payloadsDir)
err := os.RemoveAll(payloadsDir)
// We want to fully clean up the tmpdir parent of the payloads dir
tmpDir := filepath.Clean(filepath.Join(payloadsDir, ".."))
slog.Debug("cleaning up", "dir", tmpDir)
err := os.RemoveAll(tmpDir)
if err != nil {
slog.Warn("failed to clean up", "dir", payloadsDir, "err", err)
slog.Warn("failed to clean up", "dir", tmpDir, "err", err)
}
}
}

View File

@@ -155,8 +155,8 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
}
}
LOG(h.verbose, "[%d] CUDA totalMem %ld\n", i, memInfo.total);
LOG(h.verbose, "[%d] CUDA usedMem %ld\n", i, memInfo.used);
LOG(h.verbose, "[%d] CUDA totalMem %llu\n", i, memInfo.total);
LOG(h.verbose, "[%d] CUDA usedMem %llu\n", i, memInfo.used);
resp->total += memInfo.total;
resp->free += memInfo.free;

View File

@@ -149,7 +149,7 @@ func newDynExtServer(library, model string, adapters, projectors []string, opts
slog.Info("Initializing llama server")
slog.Debug(fmt.Sprintf("server params: %+v", sparams))
initResp := newExtServerResp(128)
initResp := newExtServerResp(512)
defer freeExtServerResp(initResp)
C.dyn_llama_server_init(llm.s, &sparams, &initResp)
if initResp.id < 0 {
@@ -198,6 +198,9 @@ func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn fu
if predict.Format == "json" {
request["grammar"] = jsonGrammar
if !strings.Contains(strings.ToLower(predict.Prompt), "json") {
slog.Warn("Prompt does not specify that the LLM should response in JSON, but JSON format is expected. For best results specify that JSON is expected in the system prompt.")
}
}
retryDelay := 100 * time.Microsecond
@@ -225,17 +228,14 @@ func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn fu
}
retryNeeded := false
// keep track of the last token generated, this is used to abort if the model starts looping
var lastToken string
var tokenRepeat int
out:
for {
select {
case <-ctx.Done():
// This handles the request cancellation
C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp)
if resp.id < 0 {
return extServerResponseToErr(resp)
} else {
return nil
}
return cancelCompletion(llm, resp)
default:
var result C.ext_server_task_result_t
C.dyn_llama_server_completion_next_result(llm.s, resp.id, &result)
@@ -258,6 +258,20 @@ func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn fu
break out
}
switch {
case strings.TrimSpace(p.Content) == lastToken:
tokenRepeat++
default:
lastToken = strings.TrimSpace(p.Content)
tokenRepeat = 0
}
// 30 picked as an arbitrary max token repeat limit, modify as needed
if tokenRepeat > 30 {
slog.Debug("prediction aborted, token repeat limit reached")
return cancelCompletion(llm, resp)
}
if p.Content != "" {
fn(PredictResult{
Content: p.Content,
@@ -285,6 +299,15 @@ func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn fu
return fmt.Errorf("max retries exceeded")
}
func cancelCompletion(llm *dynExtServer, resp C.ext_server_resp_t) error {
C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp)
if resp.id < 0 {
return extServerResponseToErr(resp)
} else {
return nil
}
}
func (llm *dynExtServer) Encode(ctx context.Context, prompt string) ([]int, error) {
data, err := json.Marshal(TokenizeRequest{Content: prompt})
if err != nil {

View File

@@ -26,7 +26,7 @@
#endif // GGML_USE_CUBLAS
// Expose the llama server as a callable extern "C" API
server_context *llama = NULL;
llama_server_context *llama = NULL;
std::thread ext_server_thread;
bool shutting_down = false;
std::atomic_int recv_counter;
@@ -57,7 +57,7 @@ void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
err->id = 0;
err->msg[0] = '\0';
try {
llama = new server_context;
llama = new llama_server_context;
gpt_params params;
params.n_ctx = sparams->n_ctx;
params.n_batch = sparams->n_batch;
@@ -114,18 +114,14 @@ void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
llama_backend_init();
llama_numa_init(params.numa);
// load the model
if (!llama->load_model(params)) {
// TODO - consider modifying the logging logic or patching load_model so
// we can capture more detailed error messages and pass them back to the
// caller for better UX
err->id = -1;
snprintf(err->msg, err->msg_len, "error loading model %s",
params.model.c_str());
return;
}
if (!llama->load_model(params)) {
// an error occurred that was not thrown
err->id = -1;
snprintf(err->msg, err->msg_len, "error loading model %s", params.model.c_str());
return;
}
llama->init();
llama->initialize();
} catch (std::exception &e) {
err->id = -1;
snprintf(err->msg, err->msg_len, "exception %s", e.what());
@@ -144,13 +140,13 @@ void llama_server_start() {
LOG_TEE("llama server main loop starting\n");
ggml_time_init();
llama->queue_tasks.on_new_task(std::bind(
&server_context::process_single_task, llama, std::placeholders::_1));
&llama_server_context::process_single_task, llama, std::placeholders::_1));
llama->queue_tasks.on_finish_multitask(std::bind(
&server_context::on_finish_multitask, llama, std::placeholders::_1));
&llama_server_context::on_finish_multitask, llama, std::placeholders::_1));
llama->queue_tasks.on_run_slots(std::bind(
&server_context::update_slots, llama));
&llama_server_context::update_slots, llama));
llama->queue_results.on_multitask_update(std::bind(
&server_queue::update_multitask,
&llama_server_queue::update_multitask,
&llama->queue_tasks,
std::placeholders::_1,
std::placeholders::_2,
@@ -198,7 +194,7 @@ void llama_server_completion(const char *json_req, ext_server_resp_t *resp) {
json data = json::parse(json_req);
resp->id = llama->queue_tasks.get_new_id();
llama->queue_results.add_waiting_task_id(resp->id);
llama->request_completion(resp->id, -1, data, false, false);
llama->request_completion(resp->id, data, false, false, -1);
} catch (std::exception &e) {
snprintf(resp->msg, resp->msg_len, "exception %s", e.what());
} catch (...) {
@@ -216,9 +212,9 @@ void llama_server_completion_next_result(const int task_id,
std::string result_json;
try {
atomicRecv ar(recv_counter);
server_task_result result = llama->queue_results.recv(task_id);
task_result result = llama->queue_results.recv(task_id);
result_json =
result.data.dump(-1, ' ', false, json::error_handler_t::replace);
result.result_json.dump(-1, ' ', false, json::error_handler_t::replace);
resp->id = result.id;
resp->stop = result.stop;
resp->error = result.error;
@@ -363,10 +359,10 @@ void llama_server_embedding(const char *json_req, char **json_resp,
}
const int task_id = llama->queue_tasks.get_new_id();
llama->queue_results.add_waiting_task_id(task_id);
llama->request_completion(task_id, -1, {{"prompt", prompt}, {"n_predict", 0}}, false, true);
llama->request_completion(task_id, {{"prompt", prompt}, {"n_predict", 0}}, false, true, -1);
atomicRecv ar(recv_counter);
server_task_result result = llama->queue_results.recv(task_id);
std::string result_json = result.data.dump();
task_result result = llama->queue_results.recv(task_id);
std::string result_json = result.result_json.dump();
const std::string::size_type size = result_json.size() + 1;
*json_resp = new char[size];
snprintf(*json_resp, size, "%s", result_json.c_str());

View File

@@ -18,19 +18,6 @@ sign() {
fi
}
# bundle_metal bundles ggml-common.h and ggml-metal.metal into a single file
bundle_metal() {
grep -v '#include "ggml-common.h"' "${LLAMACPP_DIR}/ggml-metal.metal" | grep -v '#pragma once' > "${LLAMACPP_DIR}/ggml-metal.metal.temp"
echo '#define GGML_COMMON_IMPL_METAL' > "${LLAMACPP_DIR}/ggml-metal.metal"
cat "${LLAMACPP_DIR}/ggml-common.h" | grep -v '#pragma once' >> "${LLAMACPP_DIR}/ggml-metal.metal"
cat "${LLAMACPP_DIR}/ggml-metal.metal.temp" >> "${LLAMACPP_DIR}/ggml-metal.metal"
rm "${LLAMACPP_DIR}/ggml-metal.metal.temp"
}
cleanup_metal() {
(cd ${LLAMACPP_DIR} && git checkout ggml-metal.metal)
}
COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin"
case "${GOARCH}" in
@@ -76,11 +63,9 @@ case "${GOARCH}" in
CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DLLAMA_METAL_EMBED_LIBRARY=on -DLLAMA_ACCELERATE=on -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=on ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/metal"
EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders"
bundle_metal
build
sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/metal/lib/libext_server.dylib
compress_libs
cleanup_metal
;;
*)
echo "GOARCH must be set"

View File

@@ -185,7 +185,7 @@ if [ -d "${ROCM_PATH}" ]; then
init_vars
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/rocm${ROCM_VARIANT}"
EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,\$ORIGIN/../rocm/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,\$ORIGIN/../../rocm/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
build
# Record the ROCM dependencies
@@ -194,6 +194,12 @@ if [ -d "${ROCM_PATH}" ]; then
for dep in $(ldd "${BUILD_DIR}/lib/libext_server.so" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e rocm -e amdgpu -e libtinfo ); do
echo "${dep}" >> "${BUILD_DIR}/lib/deps.txt"
done
# bomb out if for some reason we didn't get a few deps
if [ $(cat "${BUILD_DIR}/lib/deps.txt" | wc -l ) -lt 8 ] ; then
cat "${BUILD_DIR}/lib/deps.txt"
echo "ERROR: deps file short"
exit 1
fi
compress_libs
fi

View File

@@ -1,19 +1,21 @@
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index f255ad76..914ecfdd 100644
index 8fe5e0b1..3e82acb9 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1101,12 +1101,13 @@ struct server_context {
@@ -997,13 +997,15 @@ struct llama_server_context
slot.n_sent_text += result.text_to_send.size();
// add the token to slot queue and cache
}
- slot.add_token_string(result);
if (slot.params.stream) {
+
if (slot.params.stream)
{
send_partial_response(slot, result);
}
}
+ slot.add_token_string(result);
+
if (incomplete) {
if (incomplete)
{
slot.has_next_token = true;
}

View File

@@ -1,10 +1,10 @@
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index b14cca61..02bfd4b1 100644
index 8fe5e0b1..53bf39c1 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -29,6 +29,10 @@
@@ -31,6 +31,10 @@
#include <atomic>
#include <signal.h>
#include <memory>
+#ifdef GGML_USE_CUBLAS
+extern "C" GGML_CALL void ggml_free_cublas(void);
@@ -12,8 +12,8 @@ index b14cca61..02bfd4b1 100644
+
using json = nlohmann::json;
bool server_verbose = false;
@@ -664,6 +668,10 @@ struct server_context {
struct server_params {
@@ -363,6 +367,10 @@ struct llama_server_context
llama_free_model(model);
model = nullptr;
}
@@ -23,8 +23,8 @@ index b14cca61..02bfd4b1 100644
+#endif
}
bool load_model(const gpt_params & params_) {
@@ -3499,6 +3507,7 @@ int main(int argc, char ** argv) {
bool load_model(const gpt_params &params_)
@@ -3543,6 +3551,7 @@ int main(int argc, char **argv)
sigemptyset (&sigint_action.sa_mask);
sigint_action.sa_flags = 0;
sigaction(SIGINT, &sigint_action, NULL);
@@ -33,10 +33,10 @@ index b14cca61..02bfd4b1 100644
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index c207ff87..945708a4 100644
index 72bcec8c..6c934e8c 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -46,6 +46,7 @@
@@ -43,6 +43,7 @@
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
#define cublasCreate hipblasCreate
@@ -44,7 +44,7 @@ index c207ff87..945708a4 100644
#define cublasGemmEx hipblasGemmEx
#define cublasGemmBatchedEx hipblasGemmBatchedEx
#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
@@ -8014,10 +8015,10 @@ GGML_CALL bool ggml_cublas_loaded(void) {
@@ -8751,10 +8752,10 @@ GGML_CALL bool ggml_cublas_loaded(void) {
return g_cublas_loaded;
}
@@ -58,7 +58,7 @@ index c207ff87..945708a4 100644
#ifdef __HIP_PLATFORM_AMD__
// Workaround for a rocBLAS bug when using multiple graphics cards:
@@ -8027,7 +8028,7 @@ GGML_CALL void ggml_init_cublas() {
@@ -8764,7 +8765,7 @@ GGML_CALL void ggml_init_cublas() {
#endif
if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
@@ -67,7 +67,7 @@ index c207ff87..945708a4 100644
g_cublas_loaded = false;
fprintf(stderr, "%s: no " GGML_CUDA_NAME " devices found, " GGML_CUDA_NAME " will be disabled\n", __func__);
return;
@@ -8098,7 +8099,7 @@ GGML_CALL void ggml_init_cublas() {
@@ -8835,7 +8836,7 @@ GGML_CALL void ggml_init_cublas() {
// configure logging to stdout
// CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
@@ -76,7 +76,7 @@ index c207ff87..945708a4 100644
g_cublas_loaded = true;
}
}
@@ -11753,3 +11754,23 @@ GGML_CALL int ggml_backend_cuda_reg_devices() {
@@ -12490,3 +12491,23 @@ GGML_CALL int ggml_backend_cuda_reg_devices() {
}
return device_count;
}
@@ -100,6 +100,7 @@ index c207ff87..945708a4 100644
+
+ g_cublas_initialized = false;
+}
\ No newline at end of file
diff --git a/ggml-cuda.h b/ggml-cuda.h
index b1ebd61d..6dd58ddf 100644
--- a/ggml-cuda.h

View File

@@ -0,0 +1,44 @@
diff --git a/llama.cpp b/llama.cpp
index 4225f955..7b762f86 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4756,7 +4756,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
}
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
- return -1;
+ throw;
}
return 0;
@@ -12102,16 +12102,22 @@ struct llama_model * llama_load_model_from_file(
};
}
- int status = llama_model_load(path_model, *model, params);
- GGML_ASSERT(status <= 0);
- if (status < 0) {
- if (status == -1) {
- LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
- } else if (status == -2) {
- LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
+ try {
+ int status = llama_model_load(path_model, *model, params);
+ GGML_ASSERT(status <= 0);
+ if (status < 0) {
+ if (status == -1) {
+ LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
+ } else if (status == -2) {
+ LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
+ }
+ delete model;
+ return nullptr;
}
+ } catch (...) {
+ LLAMA_LOG_ERROR("%s: exception loading model\n", __func__);
delete model;
- return nullptr;
+ throw;
}
return model;

View File

@@ -1,10 +1,10 @@
diff --git a/llama.cpp b/llama.cpp
index b19616e8..519b9602 100644
index b27aa272..99372f9c 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -9938,7 +9938,7 @@ struct llm_tokenizer_wpm {
@@ -9360,7 +9360,7 @@ struct llm_tokenizer_wpm {
}
uint32_t to_lower(uint32_t code) {
- static const std::locale locale("en_US.UTF-8");
+ static const std::locale locale("");

View File

@@ -10,8 +10,8 @@ mkdir -p dist
for TARGETARCH in arm64 amd64; do
rm -rf llm/llama.cpp/build
GOOS=darwin GOARCH=$TARGETARCH go generate ./...
CGO_ENABLED=1 GOOS=darwin GOARCH=$TARGETARCH go build -o dist/ollama-darwin-$TARGETARCH
CGO_ENABLED=1 GOOS=darwin GOARCH=$TARGETARCH go build -cover -o dist/ollama-darwin-$TARGETARCH-cov
CGO_ENABLED=1 GOOS=darwin GOARCH=$TARGETARCH go build -trimpath -o dist/ollama-darwin-$TARGETARCH
CGO_ENABLED=1 GOOS=darwin GOARCH=$TARGETARCH go build -trimpath -cover -o dist/ollama-darwin-$TARGETARCH-cov
done
lipo -create -output dist/ollama dist/ollama-darwin-arm64 dist/ollama-darwin-amd64

View File

@@ -53,7 +53,7 @@ function buildOllama() {
write-host "Building ollama CLI"
& go generate ./...
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
& go build -ldflags "-s -w -X=github.com/jmorganca/ollama/version.Version=$script:VERSION -X=github.com/jmorganca/ollama/server.mode=release" .
& go build -trimpath -ldflags "-s -w -X=github.com/jmorganca/ollama/version.Version=$script:VERSION -X=github.com/jmorganca/ollama/server.mode=release" .
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
if ("${env:KEY_CONTAINER}") {
& "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
@@ -68,7 +68,7 @@ function buildApp() {
write-host "Building Ollama App"
cd "${script:SRC_DIR}\app"
& windres -l 0 -o ollama.syso ollama.rc
& go build -ldflags "-s -w -H windowsgui -X=github.com/jmorganca/ollama/version.Version=$script:VERSION -X=github.com/jmorganca/ollama/server.mode=release" .
& go build -trimpath -ldflags "-s -w -H windowsgui -X=github.com/jmorganca/ollama/version.Version=$script:VERSION -X=github.com/jmorganca/ollama/server.mode=release" .
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
if ("${env:KEY_CONTAINER}") {
& "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `

View File

@@ -795,10 +795,8 @@ func PruneLayers() error {
for _, blob := range blobs {
name := blob.Name()
if runtime.GOOS == "windows" {
name = strings.ReplaceAll(name, "-", ":")
}
if strings.HasPrefix(name, "sha256:") {
name = strings.ReplaceAll(name, "-", ":")
if strings.HasPrefix(name, "sha256-") {
deleteMap[name] = struct{}{}
}
}

View File

@@ -5,7 +5,6 @@ import (
"fmt"
"io"
"os"
"runtime"
"strings"
"golang.org/x/exp/slices"
@@ -47,10 +46,7 @@ func NewLayer(r io.Reader, mediatype string) (*Layer, error) {
return nil, err
}
delimiter := ":"
if runtime.GOOS == "windows" {
delimiter = "-"
}
const delimiter = "-"
pattern := strings.Join([]string{"sha256", "*-partial"}, delimiter)
temp, err := os.CreateTemp(blobs, pattern)

View File

@@ -6,7 +6,6 @@ import (
"net/url"
"os"
"path/filepath"
"runtime"
"strings"
)
@@ -150,10 +149,7 @@ func GetBlobsPath(digest string) (string, error) {
return "", err
}
if runtime.GOOS == "windows" {
digest = strings.ReplaceAll(digest, ":", "-")
}
digest = strings.ReplaceAll(digest, ":", "-")
path := filepath.Join(dir, "blobs", digest)
dirPath := filepath.Dir(path)
if digest == "" {

View File

@@ -8,6 +8,7 @@ import (
"io"
"io/fs"
"log/slog"
"math"
"net"
"net/http"
"net/netip"
@@ -16,6 +17,7 @@ import (
"path/filepath"
"reflect"
"runtime"
"strconv"
"strings"
"sync"
"syscall"
@@ -207,7 +209,7 @@ func GenerateHandler(c *gin.Context) {
var sessionDuration time.Duration
if req.KeepAlive == nil {
sessionDuration = defaultSessionDuration
sessionDuration = getDefaultSessionDuration()
} else {
sessionDuration = req.KeepAlive.Duration
}
@@ -384,6 +386,32 @@ func GenerateHandler(c *gin.Context) {
streamResponse(c, ch)
}
func getDefaultSessionDuration() time.Duration {
if t, exists := os.LookupEnv("OLLAMA_KEEP_ALIVE"); exists {
v, err := strconv.Atoi(t)
if err != nil {
d, err := time.ParseDuration(t)
if err != nil {
return defaultSessionDuration
}
if d < 0 {
return time.Duration(math.MaxInt64)
}
return d
}
d := time.Duration(v) * time.Second
if d < 0 {
return time.Duration(math.MaxInt64)
}
return d
}
return defaultSessionDuration
}
func EmbeddingsHandler(c *gin.Context) {
loaded.mu.Lock()
defer loaded.mu.Unlock()
@@ -427,7 +455,7 @@ func EmbeddingsHandler(c *gin.Context) {
var sessionDuration time.Duration
if req.KeepAlive == nil {
sessionDuration = defaultSessionDuration
sessionDuration = getDefaultSessionDuration()
} else {
sessionDuration = req.KeepAlive.Duration
}
@@ -1228,7 +1256,7 @@ func ChatHandler(c *gin.Context) {
var sessionDuration time.Duration
if req.KeepAlive == nil {
sessionDuration = defaultSessionDuration
sessionDuration = getDefaultSessionDuration()
} else {
sessionDuration = req.KeepAlive.Duration
}