initial pass at replacing ":" with "-" in image on-disk names

Update README.md
add OLLAMA_KEEP_ALIVE to environment variable docs for ollama serve (#3127)
2025-12-24 08:10:54 -05:00 · 2024-03-14 11:30:06 -07:00 · 2024-03-13 21:12:17 -07:00 · 2024-03-13 14:35:33 -07:00 · 2024-03-13 13:29:40 -07:00 · 2024-03-12 22:08:13 -07:00
27 changed files with 293 additions and 127 deletions
--- a/4
+++ b/4
@@ -92,7 +92,7 @@ COPY --from=rocm-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/b
 COPY --from=rocm-build-amd64 /go/src/github.com/jmorganca/ollama/dist/deps/ ./dist/deps/
 ARG GOFLAGS
 ARG CGO_CFLAGS
-RUN go build .
+RUN go build -trimpath .

 # Intermediate stage used for ./scripts/build_linux.sh
 FROM --platform=linux/arm64 cpu-build-arm64 AS build-arm64
@@ -103,7 +103,7 @@ COPY . .
 COPY --from=cuda-build-arm64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
 ARG GOFLAGS
 ARG CGO_CFLAGS
-RUN go build .
+RUN go build -trimpath .

 # Runtime stages
 FROM --platform=linux/amd64 ubuntu:22.04 as runtime-amd64
--- a/api/types_test.go
+++ b/api/types_test.go
@@ -0,0 +1,50 @@
+package api
+
+import (
+	"encoding/json"
+	"math"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+// TestKeepAliveParsingFromJSON checks keep_alive decoding for JSON numbers and duration strings.
+func TestKeepAliveParsingFromJSON(t *testing.T) {
+	cases := []struct {
+		name string
+		req  string
+		exp  *Duration
+	}{
+		{
+			name: "Positive Integer",
+			req:  `{ "keep_alive": 42 }`,
+			exp:  &Duration{42 * time.Second},
+		},
+		{
+			name: "Positive Integer String",
+			req:  `{ "keep_alive": "42m" }`,
+			exp:  &Duration{42 * time.Minute},
+		},
+		{
+			name: "Negative Integer",
+			req:  `{ "keep_alive": -1 }`,
+			exp:  &Duration{math.MaxInt64},
+		},
+		{
+			name: "Negative Integer String",
+			req:  `{ "keep_alive": "-1m" }`,
+			exp:  &Duration{math.MaxInt64},
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			var got ChatRequest
+			require.NoError(t, json.Unmarshal([]byte(tc.req), &got))
+
+			assert.Equal(t, tc.exp, got.KeepAlive)
+		})
+	}
+}
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -970,9 +970,10 @@ func NewCLI() *cobra.Command {
 	serveCmd.SetUsageTemplate(serveCmd.UsageTemplate() + `
 Environment Variables:

-    OLLAMA_HOST       The host:port to bind to (default "127.0.0.1:11434")
-    OLLAMA_ORIGINS    A comma separated list of allowed origins.
-    OLLAMA_MODELS     The path to the models directory (default is "~/.ollama/models")
+    OLLAMA_HOST         The host:port to bind to (default "127.0.0.1:11434")
+    OLLAMA_ORIGINS      A comma separated list of allowed origins.
+    OLLAMA_MODELS       The path to the models directory (default is "~/.ollama/models")
+    OLLAMA_KEEP_ALIVE   The duration that models stay loaded in memory (default is "5m")
 `)

 	pullCmd := &cobra.Command{
--- a/convert/convert.go
+++ b/convert/convert.go
@@ -103,9 +103,9 @@ func ReadSafeTensors(fn string, offset uint64) ([]llm.Tensor, uint64, error) {
 			return []llm.Tensor{}, 0, err
 		}

-		shape := [4]uint64{1, 1, 1, 1}
-		for cnt, s := range data.Shape {
-			shape[cnt] = uint64(s)
+		shape := []uint64{0, 0, 0, 0}
+		for i := range data.Shape {
+			shape[i] = uint64(data.Shape[i])
 		}

 		t := llm.Tensor{
--- a/docs/README.md
+++ b/docs/README.md
@@ -3,7 +3,7 @@
 ### Getting Started
 * [Quickstart](../README.md#quickstart)
 * [Examples](../examples)
-* [Importing models](./import.md) from GGUF, Pytorch and Safetensors
+* [Importing models](./import.md)
 * [Linux Documentation](./linux.md)
 * [Windows Documentation](./windows.md)
 * [Docker Documentation](https://hub.docker.com/r/ollama/ollama)
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -193,3 +193,13 @@ To unload the model and free up memory use:
 ```shell
 curl http://localhost:11434/api/generate -d '{"model": "llama2", "keep_alive": 0}'
 ```
+
+## Controlling which GPUs to use
+
+By default, on Linux and Windows, Ollama will attempt to use NVIDIA GPUs, or
+Radeon GPUs, and will use all the GPUs it can find. You can limit which GPUs
+will be utilized by setting the environment variable `CUDA_VISIBLE_DEVICES` for
+NVIDIA cards, or `HIP_VISIBLE_DEVICES` for Radeon GPUs to a comma delimited list
+of GPU IDs. You can see the list of devices with GPU tools such as `nvidia-smi` or
+`rocminfo`. You can set it to an invalid GPU ID (e.g., "-1") to bypass the GPU and
+fall back to CPU.
--- a/docs/modelfile.md
+++ b/docs/modelfile.md
@@ -131,7 +131,7 @@ The `PARAMETER` instruction defines a parameter that can be set when the model i
 PARAMETER <parameter> <parametervalue>
 ```

-### Valid Parameters and Values
+#### Valid Parameters and Values

 | Parameter      | Description                                                                                                                                                                                                                                             | Value Type | Example Usage        |
 | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------- | -------------------- |
@@ -201,7 +201,22 @@ LICENSE """

 ### MESSAGE

-The `MESSAGE` instruction allows you to specify a message history for the model to use when responding:
+The `MESSAGE` instruction allows you to specify a message history for the model to use when responding. Use multiple iterations of the MESSAGE command to build up a conversation which will guide the model to answer in a similar way.
+
+```modelfile
+MESSAGE <role> <message>
+```
+
+#### Valid roles
+
+| Role      | Description                                                  |
+| --------- | ------------------------------------------------------------ |
+| system    | Alternate way of providing the SYSTEM message for the model. |
+| user      | An example message of what the user could have asked.        |
+| assistant | An example message of how the model should respond.          |
+
+
+#### Example conversation

 ```modelfile
 MESSAGE user Is Toronto in Canada?
@@ -212,6 +227,7 @@ MESSAGE user Is Ontario in Canada?
 MESSAGE assistant yes
 ```

+
 ## Notes

 - the **`Modelfile` is not case sensitive**. In the examples, uppercase instructions are used to make it easier to distinguish it from arguments.
--- a/docs/troubleshooting.md
+++ b/docs/troubleshooting.md
@@ -109,7 +109,3 @@ which version to install.
 ```sh
 curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION="0.1.27" sh
 ```
-
-## Known issues
-
-* N/A
--- a/gpu/amd_common.go
+++ b/gpu/amd_common.go
@@ -40,19 +40,17 @@ func amdSetVisibleDevices(ids []int, skip map[int]interface{}) {
 	// TODO - does sort order matter?
 	devices := []string{}
 	for i := range ids {
-		slog.Debug(fmt.Sprintf("i=%d", i))
 		if _, skipped := skip[i]; skipped {
-			slog.Debug("skipped")
 			continue
 		}
 		devices = append(devices, strconv.Itoa(i))
 	}
-	slog.Debug(fmt.Sprintf("devices=%v", devices))

 	val := strings.Join(devices, ",")
 	err := os.Setenv("HIP_VISIBLE_DEVICES", val)
 	if err != nil {
 		slog.Warn(fmt.Sprintf("failed to set env: %s", err))
+	} else {
+		slog.Info("Setting HIP_VISIBLE_DEVICES=" + val)
 	}
-	slog.Debug("HIP_VISIBLE_DEVICES=" + val)
 }
--- a/gpu/amd_linux.go
+++ b/gpu/amd_linux.go
@@ -24,6 +24,9 @@ const (
 	GPUTotalMemoryFileGlob = "mem_banks/*/properties" // size_in_bytes line
 	GPUUsedMemoryFileGlob  = "mem_banks/*/used_memory"
 	RocmStandardLocation   = "/opt/rocm/lib"
+
+	// TODO find a better way to detect iGPU instead of minimum memory
+	IGPUMemLimit = 1024 * 1024 * 1024 // 512M is what they typically report, so anything less than 1G must be iGPU
 )

 var (
@@ -146,8 +149,8 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
 	resp.memInfo.DeviceCount = 0
 	resp.memInfo.TotalMemory = 0
 	resp.memInfo.FreeMemory = 0
+	slog.Debug("discovering VRAM for amdgpu devices")
 	if len(ids) == 0 {
-		slog.Debug("discovering all amdgpu devices")
 		entries, err := os.ReadDir(AMDNodesSysfsDir)
 		if err != nil {
 			slog.Warn(fmt.Sprintf("failed to read amdgpu sysfs %s - %s", AMDNodesSysfsDir, err))
@@ -165,7 +168,7 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
 			ids = append(ids, id)
 		}
 	}
-	slog.Debug(fmt.Sprintf("discovering amdgpu devices %v", ids))
+	slog.Debug(fmt.Sprintf("amdgpu devices %v", ids))

 	for _, id := range ids {
 		if _, skipped := skip[id]; skipped {
@@ -173,7 +176,8 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
 		}
 		totalMemory := uint64(0)
 		usedMemory := uint64(0)
-		propGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(id), GPUTotalMemoryFileGlob)
+		// Adjust for sysfs vs HIP ids
+		propGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(id+1), GPUTotalMemoryFileGlob)
 		propFiles, err := filepath.Glob(propGlob)
 		if err != nil {
 			slog.Warn(fmt.Sprintf("error looking up total GPU memory: %s %s", propGlob, err))
@@ -205,6 +209,13 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
 			}
 		}
 		if totalMemory == 0 {
+			slog.Warn(fmt.Sprintf("amdgpu [%d] reports zero total memory, skipping", id))
+			skip[id] = struct{}{}
+			continue
+		}
+		if totalMemory < IGPUMemLimit {
+			slog.Info(fmt.Sprintf("amdgpu [%d] appears to be an iGPU with %dM reported total memory, skipping", id, totalMemory/1024/1024))
+			skip[id] = struct{}{}
 			continue
 		}
 		usedGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(id), GPUUsedMemoryFileGlob)
@@ -232,8 +243,8 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
 			}
 			usedMemory += used
 		}
-		slog.Info(fmt.Sprintf("[%d] amdgpu totalMemory %d", id, totalMemory))
-		slog.Info(fmt.Sprintf("[%d] amdgpu freeMemory  %d", id, (totalMemory - usedMemory)))
+		slog.Info(fmt.Sprintf("[%d] amdgpu totalMemory %dM", id, totalMemory/1024/1024))
+		slog.Info(fmt.Sprintf("[%d] amdgpu freeMemory  %dM", id, (totalMemory-usedMemory)/1024/1024))
 		resp.memInfo.DeviceCount++
 		resp.memInfo.TotalMemory += totalMemory
 		resp.memInfo.FreeMemory += (totalMemory - usedMemory)
@@ -282,7 +293,7 @@ func AMDValidateLibDir() (string, error) {
 	}

 	// If we already have a rocm dependency wired, nothing more to do
-	rocmTargetDir := filepath.Join(payloadsDir, "rocm")
+	rocmTargetDir := filepath.Clean(filepath.Join(payloadsDir, "..", "rocm"))
 	if rocmLibUsable(rocmTargetDir) {
 		return rocmTargetDir, nil
 	}
@@ -358,6 +369,8 @@ func AMDDriverVersion() (string, error) {
 }

 func AMDGFXVersions() map[int]Version {
+	// The amdgpu driver always exposes the host CPU as node 0, but we have to skip that and subtract one
+	// from the other IDs to get alignment with the HIP libraries expectations (zero is the first GPU, not the CPU)
 	res := map[int]Version{}
 	matches, _ := filepath.Glob(GPUPropertiesFileGlob)
 	for _, match := range matches {
@@ -373,17 +386,20 @@ func AMDGFXVersions() map[int]Version {
 			continue
 		}

+		if i == 0 {
+			// Skipping the CPU
+			continue
+		}
+		// Align with HIP IDs (zero is first GPU, not CPU)
+		i -= 1
+
 		scanner := bufio.NewScanner(fp)
 		for scanner.Scan() {
 			line := strings.TrimSpace(scanner.Text())
 			if strings.HasPrefix(line, "gfx_target_version") {
 				ver := strings.Fields(line)
 				if len(ver) != 2 || len(ver[1]) < 5 {
-
-					if ver[1] == "0" {
-						// Silently skip the CPU
-						continue
-					} else {
+					if ver[1] != "0" {
 						slog.Debug("malformed " + line)
 					}
 					res[i] = Version{
--- a/gpu/assets.go
+++ b/gpu/assets.go
@@ -23,7 +23,9 @@ func PayloadsDir() (string, error) {
 		if err != nil {
 			return "", fmt.Errorf("failed to generate tmp dir: %w", err)
 		}
-		payloadsDir = tmpDir
+		// We create a distinct subdirectory for payloads within the tmpdir
+		// This will typically look like /tmp/ollama3208993108/runners on linux
+		payloadsDir = filepath.Join(tmpDir, "runners")
 	}
 	return payloadsDir, nil
 }
@@ -32,10 +34,12 @@ func Cleanup() {
 	lock.Lock()
 	defer lock.Unlock()
 	if payloadsDir != "" {
-		slog.Debug("cleaning up", "dir", payloadsDir)
-		err := os.RemoveAll(payloadsDir)
+		// We want to fully clean up the tmpdir parent of the payloads dir
+		tmpDir := filepath.Clean(filepath.Join(payloadsDir, ".."))
+		slog.Debug("cleaning up", "dir", tmpDir)
+		err := os.RemoveAll(tmpDir)
 		if err != nil {
-			slog.Warn("failed to clean up", "dir", payloadsDir, "err", err)
+			slog.Warn("failed to clean up", "dir", tmpDir, "err", err)
 		}
 	}
 }
--- a/gpu/gpu_info_cuda.c
+++ b/gpu/gpu_info_cuda.c
@@ -155,8 +155,8 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
      }
    }

-    LOG(h.verbose, "[%d] CUDA totalMem %ld\n", i, memInfo.total);
-    LOG(h.verbose, "[%d] CUDA usedMem %ld\n", i, memInfo.used);
+    LOG(h.verbose, "[%d] CUDA totalMem %llu\n", i, memInfo.total);
+    LOG(h.verbose, "[%d] CUDA usedMem %llu\n", i, memInfo.used);

    resp->total += memInfo.total;
    resp->free += memInfo.free;
--- a/llm/dyn_ext_server.go
+++ b/llm/dyn_ext_server.go
@@ -149,7 +149,7 @@ func newDynExtServer(library, model string, adapters, projectors []string, opts

 	slog.Info("Initializing llama server")
 	slog.Debug(fmt.Sprintf("server params: %+v", sparams))
-	initResp := newExtServerResp(128)
+	initResp := newExtServerResp(512)
 	defer freeExtServerResp(initResp)
 	C.dyn_llama_server_init(llm.s, &sparams, &initResp)
 	if initResp.id < 0 {
@@ -198,6 +198,9 @@ func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn fu

 	if predict.Format == "json" {
 		request["grammar"] = jsonGrammar
+		if !strings.Contains(strings.ToLower(predict.Prompt), "json") {
+			slog.Warn("Prompt does not specify that the LLM should response in JSON, but JSON format is expected. For best results specify that JSON is expected in the system prompt.")
+		}
 	}

 	retryDelay := 100 * time.Microsecond
@@ -225,17 +228,14 @@ func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn fu
 		}

 		retryNeeded := false
+		// keep track of the last token generated, this is used to abort if the model starts looping
+		var lastToken string
+		var tokenRepeat int
 	out:
 		for {
 			select {
 			case <-ctx.Done():
-				// This handles the request cancellation
-				C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp)
-				if resp.id < 0 {
-					return extServerResponseToErr(resp)
-				} else {
-					return nil
-				}
+				return cancelCompletion(llm, resp)
 			default:
 				var result C.ext_server_task_result_t
 				C.dyn_llama_server_completion_next_result(llm.s, resp.id, &result)
@@ -258,6 +258,20 @@ func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn fu
 					break out
 				}

+				switch {
+				case strings.TrimSpace(p.Content) == lastToken:
+					tokenRepeat++
+				default:
+					lastToken = strings.TrimSpace(p.Content)
+					tokenRepeat = 0
+				}
+
+				// 30 picked as an arbitrary max token repeat limit, modify as needed
+				if tokenRepeat > 30 {
+					slog.Debug("prediction aborted, token repeat limit reached")
+					return cancelCompletion(llm, resp)
+				}
+
 				if p.Content != "" {
 					fn(PredictResult{
 						Content: p.Content,
@@ -285,6 +299,15 @@ func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn fu
 	return fmt.Errorf("max retries exceeded")
 }

+// cancelCompletion cancels the completion resp.id; a negative id afterwards is returned as an error.
+func cancelCompletion(llm *dynExtServer, resp C.ext_server_resp_t) error {
+	C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp)
+	if resp.id < 0 {
+		return extServerResponseToErr(resp)
+	}
+	return nil
+}
+
 func (llm *dynExtServer) Encode(ctx context.Context, prompt string) ([]int, error) {
 	data, err := json.Marshal(TokenizeRequest{Content: prompt})
 	if err != nil {
--- a/llm/ext_server/ext_server.cpp
+++ b/llm/ext_server/ext_server.cpp
@@ -26,7 +26,7 @@
 #endif // GGML_USE_CUBLAS

 // Expose the llama server as a callable extern "C" API
-server_context *llama = NULL;
+llama_server_context *llama = NULL;
 std::thread ext_server_thread;
 bool shutting_down = false;
 std::atomic_int recv_counter;
@@ -57,7 +57,7 @@ void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
  err->id = 0;
  err->msg[0] = '\0';
  try {
-    llama = new server_context;
+    llama = new llama_server_context;
    gpt_params params;
    params.n_ctx = sparams->n_ctx;
    params.n_batch = sparams->n_batch;
@@ -114,18 +114,14 @@ void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
    llama_backend_init();
    llama_numa_init(params.numa);

-    // load the model
-    if (!llama->load_model(params)) {
-      // TODO - consider modifying the logging logic or patching load_model so
-      // we can capture more detailed error messages and pass them back to the
-      // caller for better UX
-      err->id = -1;
-      snprintf(err->msg, err->msg_len, "error loading model %s",
-               params.model.c_str());
-      return;
-    }
+  if (!llama->load_model(params)) { 
+    // an error occurred that was not thrown
+    err->id = -1;
+    snprintf(err->msg, err->msg_len, "error loading model %s", params.model.c_str());
+    return;
+  }

-    llama->init();
+    llama->initialize();
  } catch (std::exception &e) {
    err->id = -1;
    snprintf(err->msg, err->msg_len, "exception %s", e.what());
@@ -144,13 +140,13 @@ void llama_server_start() {
      LOG_TEE("llama server main loop starting\n");
      ggml_time_init();
      llama->queue_tasks.on_new_task(std::bind(
-        &server_context::process_single_task, llama, std::placeholders::_1));
+        &llama_server_context::process_single_task, llama, std::placeholders::_1));
      llama->queue_tasks.on_finish_multitask(std::bind(
-        &server_context::on_finish_multitask, llama, std::placeholders::_1));
+        &llama_server_context::on_finish_multitask, llama, std::placeholders::_1));
      llama->queue_tasks.on_run_slots(std::bind(
-        &server_context::update_slots, llama));
+        &llama_server_context::update_slots, llama));
      llama->queue_results.on_multitask_update(std::bind(
-          &server_queue::update_multitask,
+          &llama_server_queue::update_multitask,
          &llama->queue_tasks,
          std::placeholders::_1,
          std::placeholders::_2,
@@ -198,7 +194,7 @@ void llama_server_completion(const char *json_req, ext_server_resp_t *resp) {
    json data = json::parse(json_req);
    resp->id = llama->queue_tasks.get_new_id();
    llama->queue_results.add_waiting_task_id(resp->id);
-    llama->request_completion(resp->id, -1, data, false, false);
+    llama->request_completion(resp->id, data, false, false, -1);
  } catch (std::exception &e) {
    snprintf(resp->msg, resp->msg_len, "exception %s", e.what());
  } catch (...) {
@@ -216,9 +212,9 @@ void llama_server_completion_next_result(const int task_id,
  std::string result_json;
  try {
    atomicRecv ar(recv_counter);
-    server_task_result result = llama->queue_results.recv(task_id);
+    task_result result = llama->queue_results.recv(task_id);
    result_json =
-        result.data.dump(-1, ' ', false, json::error_handler_t::replace);
+        result.result_json.dump(-1, ' ', false, json::error_handler_t::replace);
    resp->id = result.id;
    resp->stop = result.stop;
    resp->error = result.error;
@@ -363,10 +359,10 @@ void llama_server_embedding(const char *json_req, char **json_resp,
    }
    const int task_id = llama->queue_tasks.get_new_id();
    llama->queue_results.add_waiting_task_id(task_id);
-    llama->request_completion(task_id, -1, {{"prompt", prompt}, {"n_predict", 0}}, false, true);
+    llama->request_completion(task_id, {{"prompt", prompt}, {"n_predict", 0}}, false, true, -1);
    atomicRecv ar(recv_counter);
-    server_task_result result = llama->queue_results.recv(task_id);
-    std::string result_json = result.data.dump();
+    task_result result = llama->queue_results.recv(task_id);
+    std::string result_json = result.result_json.dump();
    const std::string::size_type size = result_json.size() + 1;
    *json_resp = new char[size];
    snprintf(*json_resp, size, "%s", result_json.c_str());
--- a/llm/generate/gen_darwin.sh
+++ b/llm/generate/gen_darwin.sh
@@ -18,19 +18,6 @@ sign() {
    fi
 }

-# bundle_metal bundles ggml-common.h and ggml-metal.metal into a single file
-bundle_metal() {
-    grep -v '#include "ggml-common.h"' "${LLAMACPP_DIR}/ggml-metal.metal" | grep -v '#pragma once' > "${LLAMACPP_DIR}/ggml-metal.metal.temp"
-    echo '#define GGML_COMMON_IMPL_METAL' > "${LLAMACPP_DIR}/ggml-metal.metal"
-    cat "${LLAMACPP_DIR}/ggml-common.h" | grep -v '#pragma once' >> "${LLAMACPP_DIR}/ggml-metal.metal"
-    cat  "${LLAMACPP_DIR}/ggml-metal.metal.temp" >> "${LLAMACPP_DIR}/ggml-metal.metal"
-    rm "${LLAMACPP_DIR}/ggml-metal.metal.temp"
-}
-
-cleanup_metal() {
-    (cd ${LLAMACPP_DIR} && git checkout ggml-metal.metal)
-}
-
 COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin"

 case "${GOARCH}" in
@@ -76,11 +63,9 @@ case "${GOARCH}" in
    CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DLLAMA_METAL_EMBED_LIBRARY=on -DLLAMA_ACCELERATE=on -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=on ${CMAKE_DEFS}"
    BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/metal"
    EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders"
-    bundle_metal
    build
    sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/metal/lib/libext_server.dylib
    compress_libs
-    cleanup_metal
    ;;
 *)
    echo "GOARCH must be set"
--- a/llm/generate/gen_linux.sh
+++ b/llm/generate/gen_linux.sh
@@ -185,7 +185,7 @@ if [ -d "${ROCM_PATH}" ]; then
    init_vars
    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
    BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/rocm${ROCM_VARIANT}"
-    EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,\$ORIGIN/../rocm/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
+    EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,\$ORIGIN/../../rocm/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
    build

    # Record the ROCM dependencies
@@ -194,6 +194,12 @@ if [ -d "${ROCM_PATH}" ]; then
    for dep in $(ldd "${BUILD_DIR}/lib/libext_server.so" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e rocm -e amdgpu -e libtinfo ); do
        echo "${dep}" >> "${BUILD_DIR}/lib/deps.txt"
    done
+    # bomb out if for some reason we didn't get a few deps
+    if [ $(cat "${BUILD_DIR}/lib/deps.txt" | wc -l ) -lt 8 ] ; then
+        cat "${BUILD_DIR}/lib/deps.txt"
+        echo "ERROR: deps file short"
+        exit 1
+    fi
    compress_libs
 fi

--- a/llm/llama.cpp
+++ b/llm/llama.cpp
--- a/llm/patches/01-cache.diff
+++ b/llm/patches/01-cache.diff
@@ -1,19 +1,21 @@
 diff --git a/examples/server/server.cpp b/examples/server/server.cpp
-index f255ad76..914ecfdd 100644
+index 8fe5e0b1..3e82acb9 100644
 --- a/examples/server/server.cpp
 +++ b/examples/server/server.cpp
-@@ -1101,12 +1101,13 @@ struct server_context {
+@@ -997,13 +997,15 @@ struct llama_server_context
+                 slot.n_sent_text += result.text_to_send.size();
                 // add the token to slot queue and cache
             }
- 
 -            slot.add_token_string(result);
-             if (slot.params.stream) {
+
+             if (slot.params.stream)
+             {
                 send_partial_response(slot, result);
             }
         }
 
 +        slot.add_token_string(result);
 +
-         if (incomplete) {
+         if (incomplete)
+         {
             slot.has_next_token = true;
-         }
--- a/llm/patches/02-cudaleaks.diff
+++ b/llm/patches/02-cudaleaks.diff
@@ -1,10 +1,10 @@
 diff --git a/examples/server/server.cpp b/examples/server/server.cpp
-index b14cca61..02bfd4b1 100644
+index 8fe5e0b1..53bf39c1 100644
 --- a/examples/server/server.cpp
 +++ b/examples/server/server.cpp
-@@ -29,6 +29,10 @@
+@@ -31,6 +31,10 @@
+ #include <atomic>
 #include <signal.h>
- #include <memory>
 
 +#ifdef GGML_USE_CUBLAS
 +extern "C" GGML_CALL void ggml_free_cublas(void);
@@ -12,8 +12,8 @@ index b14cca61..02bfd4b1 100644
 +
 using json = nlohmann::json;
 
- bool server_verbose = false;
-@@ -664,6 +668,10 @@ struct server_context {
+ struct server_params {
+@@ -363,6 +367,10 @@ struct llama_server_context
             llama_free_model(model);
             model = nullptr;
         }
@@ -23,8 +23,8 @@ index b14cca61..02bfd4b1 100644
 +#endif
     }
 
-     bool load_model(const gpt_params & params_) {
-@@ -3499,6 +3507,7 @@ int main(int argc, char ** argv) {
+     bool load_model(const gpt_params &params_)
+@@ -3543,6 +3551,7 @@ int main(int argc, char **argv)
     sigemptyset (&sigint_action.sa_mask);
     sigint_action.sa_flags = 0;
     sigaction(SIGINT, &sigint_action, NULL);
@@ -33,10 +33,10 @@ index b14cca61..02bfd4b1 100644
     auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
         return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
 diff --git a/ggml-cuda.cu b/ggml-cuda.cu
-index c207ff87..945708a4 100644
+index 72bcec8c..6c934e8c 100644
 --- a/ggml-cuda.cu
 +++ b/ggml-cuda.cu
-@@ -46,6 +46,7 @@
+@@ -43,6 +43,7 @@
 #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
 #define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
 #define cublasCreate hipblasCreate
@@ -44,7 +44,7 @@ index c207ff87..945708a4 100644
 #define cublasGemmEx hipblasGemmEx
 #define cublasGemmBatchedEx hipblasGemmBatchedEx
 #define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
-@@ -8014,10 +8015,10 @@ GGML_CALL bool ggml_cublas_loaded(void) {
+@@ -8751,10 +8752,10 @@ GGML_CALL bool ggml_cublas_loaded(void) {
     return g_cublas_loaded;
 }
 
@@ -58,7 +58,7 @@ index c207ff87..945708a4 100644
 
 #ifdef __HIP_PLATFORM_AMD__
         // Workaround for a rocBLAS bug when using multiple graphics cards:
-@@ -8027,7 +8028,7 @@ GGML_CALL void ggml_init_cublas() {
+@@ -8764,7 +8765,7 @@ GGML_CALL void ggml_init_cublas() {
 #endif
 
         if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
@@ -67,7 +67,7 @@ index c207ff87..945708a4 100644
             g_cublas_loaded = false;
             fprintf(stderr, "%s: no " GGML_CUDA_NAME " devices found, " GGML_CUDA_NAME " will be disabled\n", __func__);
             return;
-@@ -8098,7 +8099,7 @@ GGML_CALL void ggml_init_cublas() {
+@@ -8835,7 +8836,7 @@ GGML_CALL void ggml_init_cublas() {
         // configure logging to stdout
         // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
 
@@ -76,7 +76,7 @@ index c207ff87..945708a4 100644
         g_cublas_loaded = true;
     }
 }
-@@ -11753,3 +11754,23 @@ GGML_CALL int ggml_backend_cuda_reg_devices() {
+@@ -12490,3 +12491,23 @@ GGML_CALL int ggml_backend_cuda_reg_devices() {
     }
     return device_count;
 }
@@ -100,6 +100,7 @@ index c207ff87..945708a4 100644
 +
 +    g_cublas_initialized = false;
 +}
+\ No newline at end of file
 diff --git a/ggml-cuda.h b/ggml-cuda.h
 index b1ebd61d..6dd58ddf 100644
 --- a/ggml-cuda.h
--- a/llm/patches/03-load_exception.diff
+++ b/llm/patches/03-load_exception.diff
@@ -0,0 +1,44 @@
+diff --git a/llama.cpp b/llama.cpp
+index 4225f955..7b762f86 100644
+--- a/llama.cpp
+++ b/llama.cpp
+@@ -4756,7 +4756,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
+         }
+     } catch (const std::exception & err) {
+         LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
+-        return -1;
+        throw;
+     }
+ 
+     return 0;
+@@ -12102,16 +12102,22 @@ struct llama_model * llama_load_model_from_file(
+         };
+     }
+ 
+-    int status = llama_model_load(path_model, *model, params);
+-    GGML_ASSERT(status <= 0);
+-    if (status < 0) {
+-        if (status == -1) {
+-            LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
+-        } else if (status == -2) {
+-            LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
+    try {
+        int status = llama_model_load(path_model, *model, params);
+        GGML_ASSERT(status <= 0);
+        if (status < 0) {
+            if (status == -1) {
+                LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
+            } else if (status == -2) {
+                LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
+            }
+            delete model;
+            return nullptr;
+         }
+    } catch (...) {
+        LLAMA_LOG_ERROR("%s: exception loading model\n", __func__);
+         delete model;
+-        return nullptr;
+        throw;
+     }
+ 
+     return model;
--- a/llm/patches/04-locale.diff
+++ b/llm/patches/04-locale.diff
@@ -1,10 +1,10 @@
 diff --git a/llama.cpp b/llama.cpp
-index b19616e8..519b9602 100644
+index b27aa272..99372f9c 100644
 --- a/llama.cpp
 +++ b/llama.cpp
-@@ -9938,7 +9938,7 @@ struct llm_tokenizer_wpm {
+@@ -9360,7 +9360,7 @@ struct llm_tokenizer_wpm {
     }
-
+ 
     uint32_t to_lower(uint32_t code) {
 -        static const std::locale locale("en_US.UTF-8");
 +        static const std::locale locale("");
--- a/scripts/build_darwin.sh
+++ b/scripts/build_darwin.sh
@@ -10,8 +10,8 @@ mkdir -p dist
 for TARGETARCH in arm64 amd64; do
    rm -rf llm/llama.cpp/build
    GOOS=darwin GOARCH=$TARGETARCH go generate ./...
-    CGO_ENABLED=1 GOOS=darwin GOARCH=$TARGETARCH go build -o dist/ollama-darwin-$TARGETARCH
-    CGO_ENABLED=1 GOOS=darwin GOARCH=$TARGETARCH go build -cover -o dist/ollama-darwin-$TARGETARCH-cov
+    CGO_ENABLED=1 GOOS=darwin GOARCH=$TARGETARCH go build -trimpath -o dist/ollama-darwin-$TARGETARCH
+    CGO_ENABLED=1 GOOS=darwin GOARCH=$TARGETARCH go build -trimpath -cover -o dist/ollama-darwin-$TARGETARCH-cov
 done

 lipo -create -output dist/ollama dist/ollama-darwin-arm64 dist/ollama-darwin-amd64
--- a/scripts/build_windows.ps1
+++ b/scripts/build_windows.ps1
@@ -53,7 +53,7 @@ function buildOllama() {
    write-host "Building ollama CLI"
    & go generate ./...
    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-    & go build -ldflags "-s -w -X=github.com/jmorganca/ollama/version.Version=$script:VERSION -X=github.com/jmorganca/ollama/server.mode=release" .
+    & go build -trimpath -ldflags "-s -w -X=github.com/jmorganca/ollama/version.Version=$script:VERSION -X=github.com/jmorganca/ollama/server.mode=release" .
    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
    if ("${env:KEY_CONTAINER}") {
        & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
@@ -68,7 +68,7 @@ function buildApp() {
    write-host "Building Ollama App"
    cd "${script:SRC_DIR}\app"
    & windres -l 0 -o ollama.syso ollama.rc
-    & go build -ldflags "-s -w -H windowsgui -X=github.com/jmorganca/ollama/version.Version=$script:VERSION -X=github.com/jmorganca/ollama/server.mode=release" .
+    & go build -trimpath -ldflags "-s -w -H windowsgui -X=github.com/jmorganca/ollama/version.Version=$script:VERSION -X=github.com/jmorganca/ollama/server.mode=release" .
    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
    if ("${env:KEY_CONTAINER}") {
        & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
--- a/server/images.go
+++ b/server/images.go
@@ -795,10 +795,8 @@ func PruneLayers() error {

 	for _, blob := range blobs {
 		name := blob.Name()
-		if runtime.GOOS == "windows" {
-			name = strings.ReplaceAll(name, "-", ":")
-		}
-		if strings.HasPrefix(name, "sha256:") {
+		name = strings.ReplaceAll(name, "-", ":")
+		if strings.HasPrefix(name, "sha256-") {
 			deleteMap[name] = struct{}{}
 		}
 	}
--- a/server/layers.go
+++ b/server/layers.go
@@ -5,7 +5,6 @@ import (
 	"fmt"
 	"io"
 	"os"
-	"runtime"
 	"strings"

 	"golang.org/x/exp/slices"
@@ -47,10 +46,7 @@ func NewLayer(r io.Reader, mediatype string) (*Layer, error) {
 		return nil, err
 	}

-	delimiter := ":"
-	if runtime.GOOS == "windows" {
-		delimiter = "-"
-	}
+	const delimiter = "-"

 	pattern := strings.Join([]string{"sha256", "*-partial"}, delimiter)
 	temp, err := os.CreateTemp(blobs, pattern)
--- a/server/modelpath.go
+++ b/server/modelpath.go
@@ -6,7 +6,6 @@ import (
 	"net/url"
 	"os"
 	"path/filepath"
-	"runtime"
 	"strings"
 )

@@ -150,10 +149,7 @@ func GetBlobsPath(digest string) (string, error) {
 		return "", err
 	}

-	if runtime.GOOS == "windows" {
-		digest = strings.ReplaceAll(digest, ":", "-")
-	}
-
+	digest = strings.ReplaceAll(digest, ":", "-")
 	path := filepath.Join(dir, "blobs", digest)
 	dirPath := filepath.Dir(path)
 	if digest == "" {
--- a/server/routes.go
+++ b/server/routes.go
@@ -8,6 +8,7 @@ import (
 	"io"
 	"io/fs"
 	"log/slog"
+	"math"
 	"net"
 	"net/http"
 	"net/netip"
@@ -16,6 +17,7 @@ import (
 	"path/filepath"
 	"reflect"
 	"runtime"
+	"strconv"
 	"strings"
 	"sync"
 	"syscall"
@@ -207,7 +209,7 @@ func GenerateHandler(c *gin.Context) {

 	var sessionDuration time.Duration
 	if req.KeepAlive == nil {
-		sessionDuration = defaultSessionDuration
+		sessionDuration = getDefaultSessionDuration()
 	} else {
 		sessionDuration = req.KeepAlive.Duration
 	}
@@ -384,6 +386,32 @@ func GenerateHandler(c *gin.Context) {
 	streamResponse(c, ch)
 }

+func getDefaultSessionDuration() time.Duration {
+	if t, exists := os.LookupEnv("OLLAMA_KEEP_ALIVE"); exists {
+		v, err := strconv.Atoi(t)
+		if err != nil {
+			d, err := time.ParseDuration(t)
+			if err != nil {
+				return defaultSessionDuration
+			}
+
+			if d < 0 {
+				return time.Duration(math.MaxInt64)
+			}
+
+			return d
+		}
+
+		d := time.Duration(v) * time.Second
+		if d < 0 {
+			return time.Duration(math.MaxInt64)
+		}
+		return d
+	}
+
+	return defaultSessionDuration
+}
+
 func EmbeddingsHandler(c *gin.Context) {
 	loaded.mu.Lock()
 	defer loaded.mu.Unlock()
@@ -427,7 +455,7 @@ func EmbeddingsHandler(c *gin.Context) {

 	var sessionDuration time.Duration
 	if req.KeepAlive == nil {
-		sessionDuration = defaultSessionDuration
+		sessionDuration = getDefaultSessionDuration()
 	} else {
 		sessionDuration = req.KeepAlive.Duration
 	}
@@ -1228,7 +1256,7 @@ func ChatHandler(c *gin.Context) {

 	var sessionDuration time.Duration
 	if req.KeepAlive == nil {
-		sessionDuration = defaultSessionDuration
+		sessionDuration = getDefaultSessionDuration()
 	} else {
 		sessionDuration = req.KeepAlive.Duration
 	}
Author	SHA1	Message	Date
Blake Mizerany	d9ea2e5c7a	inital pass at replacing ":" with "-" in image on-disk names	2024-03-14 11:30:06 -07:00
Jeffrey Morgan	5ce997a7b9	Update README.md	2024-03-13 21:12:17 -07:00
Jeffrey Morgan	672ffe9b7d	add `OLLAMA_KEEP_ALIVE` to environment variable docs for `ollama serve` (#3127 )	2024-03-13 14:35:33 -07:00
Patrick Devine	47cfe58af5	Default Keep Alive environment variable (#3094 ) --------- Co-authored-by: Chris-AS1 <8493773+Chris-AS1@users.noreply.github.com>	2024-03-13 13:29:40 -07:00
Jeffrey Morgan	e72c567cfd	restore locale patch (#3091 )	2024-03-12 22:08:13 -07:00
Bruce MacDonald	3e22611200	token repeat limit for prediction requests (#3080 )	2024-03-12 22:08:25 -04:00
Daniel Hiltgen	a54d4a28dc	Merge pull request #3088 from dhiltgen/rocm_igpu_linux Fix iGPU detection for linux	2024-03-12 17:20:27 -07:00
Daniel Hiltgen	82b0c7c27e	Fix iGPU detection for linux This fixes a few bugs in the new sysfs discovery logic. iGPUs are now correctly identified by their <1G VRAM reported. the sysfs IDs are off by one compared to what HIP wants due to the CPU being reported in amdgpu, but HIP only cares about GPUs.	2024-03-12 16:57:19 -07:00
Patrick Devine	ba7cf7fb66	add more docs on for the modelfile message command (#3087 )	2024-03-12 16:41:41 -07:00
Bruce MacDonald	2f804068bd	warn when json format is expected but not mentioned in prompt (#3081 )	2024-03-12 19:07:11 -04:00
Daniel Hiltgen	34d00f90b1	Merge pull request #3070 from dhiltgen/visible_devices Add docs explaining GPU selection env vars	2024-03-12 11:36:46 -07:00
Daniel Hiltgen	b53229a2ed	Add docs explaining GPU selection env vars	2024-03-12 11:33:06 -07:00
racerole	53c107e20e	chore: fix typo (#3073 ) Signed-off-by: racerole <jiangyifeng@outlook.com>	2024-03-12 14:09:22 -04:00
mofanke	51578d8573	fix gpu_info_cuda.c compile warning (#3077 )	2024-03-12 14:08:40 -04:00
Jeffrey Morgan	b5fcd9d3aa	use `-trimpath` when building releases (#3069 )	2024-03-11 15:58:46 -07:00
Bruce MacDonald	b80661e8c7	relay load model errors to the client (#3065 )	2024-03-11 16:48:27 -04:00
Jeffrey Morgan	6d3adfbea2	Update troubleshooting.md	2024-03-11 13:22:28 -07:00
Jeffrey Morgan	369eda65f5	update llama.cpp submodule to `ceca1ae` (#3064 )	2024-03-11 12:57:48 -07:00
Michael Yang	f878e91070	Merge pull request #3044 from ollama/mxyng/fix-convert-shape convert: fix shape	2024-03-11 09:56:57 -07:00
Daniel Hiltgen	0d651478e4	Merge pull request #3056 from dhiltgen/rocm_link_clash Avoid rocm runner and dependency clash	2024-03-11 09:48:48 -07:00
Michael Yang	9ea492f1ce	convert: fix shape	2024-03-11 09:41:01 -07:00
Daniel Hiltgen	bc13da2bfe	Avoid rocm runner and dependency clash Putting the rocm symlink next to the runners is risky. This moves the payloads into a subdir to avoid potential clashes.	2024-03-11 09:33:22 -07:00
Jeffrey Morgan	41b00b9856	fix `03-locale.diff`	2024-03-10 16:21:05 -07:00
Daniel Hiltgen	c2a8ed48e7	Merge pull request #3048 from dhiltgen/harden_rocm_deps Harden for deps file being empty (or short)	2024-03-10 15:17:22 -07:00
Daniel Hiltgen	3dc1bb6a35	Harden for deps file being empty (or short)	2024-03-10 14:45:38 -07:00