Compare commits


1 Commit

Author SHA1 Message Date
Jeffrey Morgan  d10d3aac58  disable execstack for amd libraries  2024-03-10 15:08:46 -07:00
26 changed files with 121 additions and 337 deletions

View File

@@ -42,7 +42,7 @@ ARG AMDGPU_TARGETS
RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
RUN mkdir /tmp/scratch && \
for dep in $(cat /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/x86_64/rocm*/lib/deps.txt) ; do \
cp ${dep} /tmp/scratch/ || exit 1 ; \
cp ${dep} /tmp/scratch/ || exit 1 ; \
done && \
(cd /opt/rocm/lib && tar cf - rocblas/library) | (cd /tmp/scratch/ && tar xf - ) && \
mkdir -p /go/src/github.com/jmorganca/ollama/dist/deps/ && \
@@ -92,7 +92,7 @@ COPY --from=rocm-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/b
COPY --from=rocm-build-amd64 /go/src/github.com/jmorganca/ollama/dist/deps/ ./dist/deps/
ARG GOFLAGS
ARG CGO_CFLAGS
RUN go build -trimpath .
RUN go build .
# Intermediate stage used for ./scripts/build_linux.sh
FROM --platform=linux/arm64 cpu-build-arm64 AS build-arm64
@@ -103,7 +103,7 @@ COPY . .
COPY --from=cuda-build-arm64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
ARG GOFLAGS
ARG CGO_CFLAGS
RUN go build -trimpath .
RUN go build .
# Runtime stages
FROM --platform=linux/amd64 ubuntu:22.04 as runtime-amd64

View File

@@ -1,50 +0,0 @@
package api
import (
"encoding/json"
"math"
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestKeepAliveParsingFromJSON(t *testing.T) {
tests := []struct {
name string
req string
exp *Duration
}{
{
name: "Positive Integer",
req: `{ "keep_alive": 42 }`,
exp: &Duration{42 * time.Second},
},
{
name: "Positive Integer String",
req: `{ "keep_alive": "42m" }`,
exp: &Duration{42 * time.Minute},
},
{
name: "Negative Integer",
req: `{ "keep_alive": -1 }`,
exp: &Duration{math.MaxInt64},
},
{
name: "Negative Integer String",
req: `{ "keep_alive": "-1m" }`,
exp: &Duration{math.MaxInt64},
},
}
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
var dec ChatRequest
err := json.Unmarshal([]byte(test.req), &dec)
require.NoError(t, err)
assert.Equal(t, test.exp, dec.KeepAlive)
})
}
}
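
The removed test above documents the `keep_alive` parsing rules: bare integers are treated as seconds, strings are parsed as Go-style durations, and negative values map to the maximum duration (keep the model loaded indefinitely). For illustration, the same values as they would appear in an API request (a sketch following the `/api/generate` example used elsewhere in these docs; the model name is an example):

```shell
# keep the model loaded for 42 minutes after this request
curl http://localhost:11434/api/generate -d '{"model": "llama2", "keep_alive": "42m"}'

# keep the model loaded indefinitely
curl http://localhost:11434/api/generate -d '{"model": "llama2", "keep_alive": -1}'
```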

View File

@@ -900,7 +900,8 @@ func NewCLI() *cobra.Command {
cobra.EnableCommandSorting = false
if runtime.GOOS == "windows" {
console.ConsoleFromFile(os.Stdin) //nolint:errcheck
// Enable colorful ANSI escape code in Windows terminal (disabled by default)
console.ConsoleFromFile(os.Stdout) //nolint:errcheck
}
rootCmd := &cobra.Command{
@@ -969,10 +970,9 @@ func NewCLI() *cobra.Command {
serveCmd.SetUsageTemplate(serveCmd.UsageTemplate() + `
Environment Variables:
OLLAMA_HOST The host:port to bind to (default "127.0.0.1:11434")
OLLAMA_ORIGINS A comma separated list of allowed origins.
OLLAMA_MODELS The path to the models directory (default is "~/.ollama/models")
OLLAMA_KEEP_ALIVE The duration that models stay loaded in memory (default is "5m")
OLLAMA_HOST The host:port to bind to (default "127.0.0.1:11434")
OLLAMA_ORIGINS A comma separated list of allowed origins.
OLLAMA_MODELS The path to the models directory (default is "~/.ollama/models")
`)
pullCmd := &cobra.Command{
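
The usage template above lists the environment variables honored by `ollama serve`; a minimal usage sketch (the address and path below are example values):

```shell
# Example: bind to all interfaces and store models in a custom directory
OLLAMA_HOST=0.0.0.0:11434 OLLAMA_MODELS=/data/ollama/models ollama serve
```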

View File

@@ -103,9 +103,9 @@ func ReadSafeTensors(fn string, offset uint64) ([]llm.Tensor, uint64, error) {
return []llm.Tensor{}, 0, err
}
shape := []uint64{0, 0, 0, 0}
for i := range data.Shape {
shape[i] = uint64(data.Shape[i])
shape := [4]uint64{1, 1, 1, 1}
for cnt, s := range data.Shape {
shape[cnt] = uint64(s)
}
t := llm.Tensor{

View File

@@ -3,7 +3,7 @@
### Getting Started
* [Quickstart](../README.md#quickstart)
* [Examples](../examples)
* [Importing models](./import.md)
* [Importing models](./import.md) from GGUF, PyTorch and Safetensors
* [Linux Documentation](./linux.md)
* [Windows Documentation](./windows.md)
* [Docker Documentation](https://hub.docker.com/r/ollama/ollama)

View File

@@ -193,13 +193,3 @@ To unload the model and free up memory use:
```shell
curl http://localhost:11434/api/generate -d '{"model": "llama2", "keep_alive": 0}'
```
## Controlling which GPUs to use
By default, on Linux and Windows, Ollama will attempt to use NVIDIA or Radeon
GPUs and will use all the GPUs it can find. You can limit which GPUs are used
by setting the environment variable `CUDA_VISIBLE_DEVICES` (for NVIDIA cards)
or `HIP_VISIBLE_DEVICES` (for Radeon GPUs) to a comma-delimited list of GPU
IDs. You can see the list of devices with GPU tools such as `nvidia-smi` or
`rocminfo`. You can set the variable to an invalid GPU ID (e.g., "-1") to
bypass the GPU and fall back to the CPU.
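For example (a sketch; the device IDs shown are illustrative and depend on the host):

```shell
# NVIDIA: expose only the first two GPUs to Ollama
CUDA_VISIBLE_DEVICES=0,1 ollama serve

# Radeon: expose only the first GPU
HIP_VISIBLE_DEVICES=0 ollama serve

# Bypass the GPUs entirely and run on the CPU
CUDA_VISIBLE_DEVICES=-1 ollama serve
```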

View File

@@ -131,7 +131,7 @@ The `PARAMETER` instruction defines a parameter that can be set when the model i
PARAMETER <parameter> <parametervalue>
```
#### Valid Parameters and Values
### Valid Parameters and Values
| Parameter | Description | Value Type | Example Usage |
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------- | -------------------- |
@@ -201,22 +201,7 @@ LICENSE """
### MESSAGE
The `MESSAGE` instruction allows you to specify a message history for the model to use when responding. Use multiple iterations of the MESSAGE command to build up a conversation which will guide the model to answer in a similar way.
```modelfile
MESSAGE <role> <message>
```
#### Valid roles
| Role | Description |
| --------- | ------------------------------------------------------------ |
| system | Alternate way of providing the SYSTEM message for the model. |
| user | An example message of what the user could have asked. |
| assistant | An example message of how the model should respond. |
#### Example conversation
The `MESSAGE` instruction allows you to specify a message history for the model to use when responding:
```modelfile
MESSAGE user Is Toronto in Canada?
@@ -227,7 +212,6 @@ MESSAGE user Is Ontario in Canada?
MESSAGE assistant yes
```
## Notes
- the **`Modelfile` is not case sensitive**. In the examples, uppercase instructions are used to make it easier to distinguish them from arguments.

View File

@@ -109,3 +109,7 @@ which version to install.
```sh
curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION="0.1.27" sh
```
## Known issues
* N/A

View File

@@ -40,17 +40,19 @@ func amdSetVisibleDevices(ids []int, skip map[int]interface{}) {
// TODO - does sort order matter?
devices := []string{}
for i := range ids {
slog.Debug(fmt.Sprintf("i=%d", i))
if _, skipped := skip[i]; skipped {
slog.Debug("skipped")
continue
}
devices = append(devices, strconv.Itoa(i))
}
slog.Debug(fmt.Sprintf("devices=%v", devices))
val := strings.Join(devices, ",")
err := os.Setenv("HIP_VISIBLE_DEVICES", val)
if err != nil {
slog.Warn(fmt.Sprintf("failed to set env: %s", err))
} else {
slog.Info("Setting HIP_VISIBLE_DEVICES=" + val)
}
slog.Debug("HIP_VISIBLE_DEVICES=" + val)
}

View File

@@ -24,9 +24,6 @@ const (
GPUTotalMemoryFileGlob = "mem_banks/*/properties" // size_in_bytes line
GPUUsedMemoryFileGlob = "mem_banks/*/used_memory"
RocmStandardLocation = "/opt/rocm/lib"
// TODO find a better way to detect iGPU instead of minimum memory
IGPUMemLimit = 1024 * 1024 * 1024 // 512G is what they typically report, so anything less than 1G must be iGPU
)
var (
@@ -149,8 +146,8 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
resp.memInfo.DeviceCount = 0
resp.memInfo.TotalMemory = 0
resp.memInfo.FreeMemory = 0
slog.Debug("discovering VRAM for amdgpu devices")
if len(ids) == 0 {
slog.Debug("discovering all amdgpu devices")
entries, err := os.ReadDir(AMDNodesSysfsDir)
if err != nil {
slog.Warn(fmt.Sprintf("failed to read amdgpu sysfs %s - %s", AMDNodesSysfsDir, err))
@@ -168,7 +165,7 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
ids = append(ids, id)
}
}
slog.Debug(fmt.Sprintf("amdgpu devices %v", ids))
slog.Debug(fmt.Sprintf("discovering amdgpu devices %v", ids))
for _, id := range ids {
if _, skipped := skip[id]; skipped {
@@ -176,8 +173,7 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
}
totalMemory := uint64(0)
usedMemory := uint64(0)
// Adjust for sysfs vs HIP ids
propGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(id+1), GPUTotalMemoryFileGlob)
propGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(id), GPUTotalMemoryFileGlob)
propFiles, err := filepath.Glob(propGlob)
if err != nil {
slog.Warn(fmt.Sprintf("error looking up total GPU memory: %s %s", propGlob, err))
@@ -209,13 +205,6 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
}
}
if totalMemory == 0 {
slog.Warn(fmt.Sprintf("amdgpu [%d] reports zero total memory, skipping", id))
skip[id] = struct{}{}
continue
}
if totalMemory < IGPUMemLimit {
slog.Info(fmt.Sprintf("amdgpu [%d] appears to be an iGPU with %dM reported total memory, skipping", id, totalMemory/1024/1024))
skip[id] = struct{}{}
continue
}
usedGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(id), GPUUsedMemoryFileGlob)
@@ -243,8 +232,8 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
}
usedMemory += used
}
slog.Info(fmt.Sprintf("[%d] amdgpu totalMemory %dM", id, totalMemory/1024/1024))
slog.Info(fmt.Sprintf("[%d] amdgpu freeMemory %dM", id, (totalMemory-usedMemory)/1024/1024))
slog.Info(fmt.Sprintf("[%d] amdgpu totalMemory %d", id, totalMemory))
slog.Info(fmt.Sprintf("[%d] amdgpu freeMemory %d", id, (totalMemory - usedMemory)))
resp.memInfo.DeviceCount++
resp.memInfo.TotalMemory += totalMemory
resp.memInfo.FreeMemory += (totalMemory - usedMemory)
@@ -293,7 +282,7 @@ func AMDValidateLibDir() (string, error) {
}
// If we already have a rocm dependency wired, nothing more to do
rocmTargetDir := filepath.Clean(filepath.Join(payloadsDir, "..", "rocm"))
rocmTargetDir := filepath.Join(payloadsDir, "rocm")
if rocmLibUsable(rocmTargetDir) {
return rocmTargetDir, nil
}
@@ -369,8 +358,6 @@ func AMDDriverVersion() (string, error) {
}
func AMDGFXVersions() map[int]Version {
// The amdgpu driver always exposes the host CPU as node 0, but we have to skip that and subtract one
// from the other IDs to get alignment with the HIP libraries expectations (zero is the first GPU, not the CPU)
res := map[int]Version{}
matches, _ := filepath.Glob(GPUPropertiesFileGlob)
for _, match := range matches {
@@ -386,20 +373,17 @@ func AMDGFXVersions() map[int]Version {
continue
}
if i == 0 {
// Skipping the CPU
continue
}
// Align with HIP IDs (zero is first GPU, not CPU)
i -= 1
scanner := bufio.NewScanner(fp)
for scanner.Scan() {
line := strings.TrimSpace(scanner.Text())
if strings.HasPrefix(line, "gfx_target_version") {
ver := strings.Fields(line)
if len(ver) != 2 || len(ver[1]) < 5 {
if ver[1] != "0" {
if ver[1] == "0" {
// Silently skip the CPU
continue
} else {
slog.Debug("malformed " + line)
}
res[i] = Version{

View File

@@ -23,9 +23,7 @@ func PayloadsDir() (string, error) {
if err != nil {
return "", fmt.Errorf("failed to generate tmp dir: %w", err)
}
// We create a distinct subdirectory for payloads within the tmpdir
// This will typically look like /tmp/ollama3208993108/runners on linux
payloadsDir = filepath.Join(tmpDir, "runners")
payloadsDir = tmpDir
}
return payloadsDir, nil
}
@@ -34,12 +32,10 @@ func Cleanup() {
lock.Lock()
defer lock.Unlock()
if payloadsDir != "" {
// We want to fully clean up the tmpdir parent of the payloads dir
tmpDir := filepath.Clean(filepath.Join(payloadsDir, ".."))
slog.Debug("cleaning up", "dir", tmpDir)
err := os.RemoveAll(tmpDir)
slog.Debug("cleaning up", "dir", payloadsDir)
err := os.RemoveAll(payloadsDir)
if err != nil {
slog.Warn("failed to clean up", "dir", tmpDir, "err", err)
slog.Warn("failed to clean up", "dir", payloadsDir, "err", err)
}
}
}

View File

@@ -155,8 +155,8 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
}
}
LOG(h.verbose, "[%d] CUDA totalMem %llu\n", i, memInfo.total);
LOG(h.verbose, "[%d] CUDA usedMem %llu\n", i, memInfo.used);
LOG(h.verbose, "[%d] CUDA totalMem %ld\n", i, memInfo.total);
LOG(h.verbose, "[%d] CUDA usedMem %ld\n", i, memInfo.used);
resp->total += memInfo.total;
resp->free += memInfo.free;

View File

@@ -149,7 +149,7 @@ func newDynExtServer(library, model string, adapters, projectors []string, opts
slog.Info("Initializing llama server")
slog.Debug(fmt.Sprintf("server params: %+v", sparams))
initResp := newExtServerResp(512)
initResp := newExtServerResp(128)
defer freeExtServerResp(initResp)
C.dyn_llama_server_init(llm.s, &sparams, &initResp)
if initResp.id < 0 {
@@ -198,9 +198,6 @@ func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn fu
if predict.Format == "json" {
request["grammar"] = jsonGrammar
if !strings.Contains(strings.ToLower(predict.Prompt), "json") {
slog.Warn("Prompt does not specify that the LLM should response in JSON, but JSON format is expected. For best results specify that JSON is expected in the system prompt.")
}
}
retryDelay := 100 * time.Microsecond
@@ -228,14 +225,17 @@ func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn fu
}
retryNeeded := false
// keep track of the last token generated, this is used to abort if the model starts looping
var lastToken string
var tokenRepeat int
out:
for {
select {
case <-ctx.Done():
return cancelCompletion(llm, resp)
// This handles the request cancellation
C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp)
if resp.id < 0 {
return extServerResponseToErr(resp)
} else {
return nil
}
default:
var result C.ext_server_task_result_t
C.dyn_llama_server_completion_next_result(llm.s, resp.id, &result)
@@ -258,20 +258,6 @@ func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn fu
break out
}
switch {
case strings.TrimSpace(p.Content) == lastToken:
tokenRepeat++
default:
lastToken = strings.TrimSpace(p.Content)
tokenRepeat = 0
}
// 30 picked as an arbitrary max token repeat limit, modify as needed
if tokenRepeat > 30 {
slog.Debug("prediction aborted, token repeat limit reached")
return cancelCompletion(llm, resp)
}
if p.Content != "" {
fn(PredictResult{
Content: p.Content,
@@ -299,15 +285,6 @@ func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn fu
return fmt.Errorf("max retries exceeded")
}
func cancelCompletion(llm *dynExtServer, resp C.ext_server_resp_t) error {
C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp)
if resp.id < 0 {
return extServerResponseToErr(resp)
} else {
return nil
}
}
func (llm *dynExtServer) Encode(ctx context.Context, prompt string) ([]int, error) {
data, err := json.Marshal(TokenizeRequest{Content: prompt})
if err != nil {

View File

@@ -26,7 +26,7 @@
#endif // GGML_USE_CUBLAS
// Expose the llama server as a callable extern "C" API
llama_server_context *llama = NULL;
server_context *llama = NULL;
std::thread ext_server_thread;
bool shutting_down = false;
std::atomic_int recv_counter;
@@ -57,7 +57,7 @@ void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
err->id = 0;
err->msg[0] = '\0';
try {
llama = new llama_server_context;
llama = new server_context;
gpt_params params;
params.n_ctx = sparams->n_ctx;
params.n_batch = sparams->n_batch;
@@ -114,14 +114,18 @@ void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
llama_backend_init();
llama_numa_init(params.numa);
if (!llama->load_model(params)) {
// an error occurred that was not thrown
err->id = -1;
snprintf(err->msg, err->msg_len, "error loading model %s", params.model.c_str());
return;
}
// load the model
if (!llama->load_model(params)) {
// TODO - consider modifying the logging logic or patching load_model so
// we can capture more detailed error messages and pass them back to the
// caller for better UX
err->id = -1;
snprintf(err->msg, err->msg_len, "error loading model %s",
params.model.c_str());
return;
}
llama->initialize();
llama->init();
} catch (std::exception &e) {
err->id = -1;
snprintf(err->msg, err->msg_len, "exception %s", e.what());
@@ -140,13 +144,13 @@ void llama_server_start() {
LOG_TEE("llama server main loop starting\n");
ggml_time_init();
llama->queue_tasks.on_new_task(std::bind(
&llama_server_context::process_single_task, llama, std::placeholders::_1));
&server_context::process_single_task, llama, std::placeholders::_1));
llama->queue_tasks.on_finish_multitask(std::bind(
&llama_server_context::on_finish_multitask, llama, std::placeholders::_1));
&server_context::on_finish_multitask, llama, std::placeholders::_1));
llama->queue_tasks.on_run_slots(std::bind(
&llama_server_context::update_slots, llama));
&server_context::update_slots, llama));
llama->queue_results.on_multitask_update(std::bind(
&llama_server_queue::update_multitask,
&server_queue::update_multitask,
&llama->queue_tasks,
std::placeholders::_1,
std::placeholders::_2,
@@ -194,7 +198,7 @@ void llama_server_completion(const char *json_req, ext_server_resp_t *resp) {
json data = json::parse(json_req);
resp->id = llama->queue_tasks.get_new_id();
llama->queue_results.add_waiting_task_id(resp->id);
llama->request_completion(resp->id, data, false, false, -1);
llama->request_completion(resp->id, -1, data, false, false);
} catch (std::exception &e) {
snprintf(resp->msg, resp->msg_len, "exception %s", e.what());
} catch (...) {
@@ -212,9 +216,9 @@ void llama_server_completion_next_result(const int task_id,
std::string result_json;
try {
atomicRecv ar(recv_counter);
task_result result = llama->queue_results.recv(task_id);
server_task_result result = llama->queue_results.recv(task_id);
result_json =
result.result_json.dump(-1, ' ', false, json::error_handler_t::replace);
result.data.dump(-1, ' ', false, json::error_handler_t::replace);
resp->id = result.id;
resp->stop = result.stop;
resp->error = result.error;
@@ -359,10 +363,10 @@ void llama_server_embedding(const char *json_req, char **json_resp,
}
const int task_id = llama->queue_tasks.get_new_id();
llama->queue_results.add_waiting_task_id(task_id);
llama->request_completion(task_id, {{"prompt", prompt}, {"n_predict", 0}}, false, true, -1);
llama->request_completion(task_id, -1, {{"prompt", prompt}, {"n_predict", 0}}, false, true);
atomicRecv ar(recv_counter);
task_result result = llama->queue_results.recv(task_id);
std::string result_json = result.result_json.dump();
server_task_result result = llama->queue_results.recv(task_id);
std::string result_json = result.data.dump();
const std::string::size_type size = result_json.size() + 1;
*json_resp = new char[size];
snprintf(*json_resp, size, "%s", result_json.c_str());

View File

@@ -18,6 +18,19 @@ sign() {
fi
}
# bundle_metal bundles ggml-common.h and ggml-metal.metal into a single file
bundle_metal() {
grep -v '#include "ggml-common.h"' "${LLAMACPP_DIR}/ggml-metal.metal" | grep -v '#pragma once' > "${LLAMACPP_DIR}/ggml-metal.metal.temp"
echo '#define GGML_COMMON_IMPL_METAL' > "${LLAMACPP_DIR}/ggml-metal.metal"
cat "${LLAMACPP_DIR}/ggml-common.h" | grep -v '#pragma once' >> "${LLAMACPP_DIR}/ggml-metal.metal"
cat "${LLAMACPP_DIR}/ggml-metal.metal.temp" >> "${LLAMACPP_DIR}/ggml-metal.metal"
rm "${LLAMACPP_DIR}/ggml-metal.metal.temp"
}
cleanup_metal() {
(cd ${LLAMACPP_DIR} && git checkout ggml-metal.metal)
}
COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin"
case "${GOARCH}" in
@@ -63,9 +76,11 @@ case "${GOARCH}" in
CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DLLAMA_METAL_EMBED_LIBRARY=on -DLLAMA_ACCELERATE=on -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=on ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/metal"
EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders"
bundle_metal
build
sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/metal/lib/libext_server.dylib
compress_libs
cleanup_metal
;;
*)
echo "GOARCH must be set"

View File

@@ -185,21 +185,19 @@ if [ -d "${ROCM_PATH}" ]; then
init_vars
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/rocm${ROCM_VARIANT}"
EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,\$ORIGIN/../../rocm/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,\$ORIGIN/../rocm/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
build
# Record the ROCM dependencies
rm -f "${BUILD_DIR}/lib/deps.txt"
touch "${BUILD_DIR}/lib/deps.txt"
# having the execstack bit set on the HIP runtime sometimes causes `ldd` to error
execstack -c "${ROCM_PATH}/lib/libamdhip64.so*"
for dep in $(ldd "${BUILD_DIR}/lib/libext_server.so" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e rocm -e amdgpu -e libtinfo ); do
echo "${dep}" >> "${BUILD_DIR}/lib/deps.txt"
done
# bomb out if for some reason we didn't get a few deps
if [ $(cat "${BUILD_DIR}/lib/deps.txt" | wc -l ) -lt 8 ] ; then
cat "${BUILD_DIR}/lib/deps.txt"
echo "ERROR: deps file short"
exit 1
fi
compress_libs
fi

View File

@@ -1,21 +1,19 @@
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 8fe5e0b1..3e82acb9 100644
index f255ad76..914ecfdd 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -997,13 +997,15 @@ struct llama_server_context
slot.n_sent_text += result.text_to_send.size();
@@ -1101,12 +1101,13 @@ struct server_context {
// add the token to slot queue and cache
}
- slot.add_token_string(result);
+
if (slot.params.stream)
{
if (slot.params.stream) {
send_partial_response(slot, result);
}
}
+ slot.add_token_string(result);
+
if (incomplete)
{
if (incomplete) {
slot.has_next_token = true;
}

View File

@@ -1,10 +1,10 @@
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 8fe5e0b1..53bf39c1 100644
index b14cca61..02bfd4b1 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -31,6 +31,10 @@
#include <atomic>
@@ -29,6 +29,10 @@
#include <signal.h>
#include <memory>
+#ifdef GGML_USE_CUBLAS
+extern "C" GGML_CALL void ggml_free_cublas(void);
@@ -12,8 +12,8 @@ index 8fe5e0b1..53bf39c1 100644
+
using json = nlohmann::json;
struct server_params {
@@ -363,6 +367,10 @@ struct llama_server_context
bool server_verbose = false;
@@ -664,6 +668,10 @@ struct server_context {
llama_free_model(model);
model = nullptr;
}
@@ -23,8 +23,8 @@ index 8fe5e0b1..53bf39c1 100644
+#endif
}
bool load_model(const gpt_params &params_)
@@ -3543,6 +3551,7 @@ int main(int argc, char **argv)
bool load_model(const gpt_params & params_) {
@@ -3499,6 +3507,7 @@ int main(int argc, char ** argv) {
sigemptyset (&sigint_action.sa_mask);
sigint_action.sa_flags = 0;
sigaction(SIGINT, &sigint_action, NULL);
@@ -33,10 +33,10 @@ index 8fe5e0b1..53bf39c1 100644
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 72bcec8c..6c934e8c 100644
index c207ff87..945708a4 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -43,6 +43,7 @@
@@ -46,6 +46,7 @@
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
#define cublasCreate hipblasCreate
@@ -44,7 +44,7 @@ index 72bcec8c..6c934e8c 100644
#define cublasGemmEx hipblasGemmEx
#define cublasGemmBatchedEx hipblasGemmBatchedEx
#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
@@ -8751,10 +8752,10 @@ GGML_CALL bool ggml_cublas_loaded(void) {
@@ -8014,10 +8015,10 @@ GGML_CALL bool ggml_cublas_loaded(void) {
return g_cublas_loaded;
}
@@ -58,7 +58,7 @@ index 72bcec8c..6c934e8c 100644
#ifdef __HIP_PLATFORM_AMD__
// Workaround for a rocBLAS bug when using multiple graphics cards:
@@ -8764,7 +8765,7 @@ GGML_CALL void ggml_init_cublas() {
@@ -8027,7 +8028,7 @@ GGML_CALL void ggml_init_cublas() {
#endif
if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
@@ -67,7 +67,7 @@ index 72bcec8c..6c934e8c 100644
g_cublas_loaded = false;
fprintf(stderr, "%s: no " GGML_CUDA_NAME " devices found, " GGML_CUDA_NAME " will be disabled\n", __func__);
return;
@@ -8835,7 +8836,7 @@ GGML_CALL void ggml_init_cublas() {
@@ -8098,7 +8099,7 @@ GGML_CALL void ggml_init_cublas() {
// configure logging to stdout
// CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
@@ -76,7 +76,7 @@ index 72bcec8c..6c934e8c 100644
g_cublas_loaded = true;
}
}
@@ -12490,3 +12491,23 @@ GGML_CALL int ggml_backend_cuda_reg_devices() {
@@ -11753,3 +11754,23 @@ GGML_CALL int ggml_backend_cuda_reg_devices() {
}
return device_count;
}
@@ -100,7 +100,6 @@ index 72bcec8c..6c934e8c 100644
+
+ g_cublas_initialized = false;
+}
\ No newline at end of file
diff --git a/ggml-cuda.h b/ggml-cuda.h
index b1ebd61d..6dd58ddf 100644
--- a/ggml-cuda.h

View File

@@ -1,44 +0,0 @@
diff --git a/llama.cpp b/llama.cpp
index 4225f955..7b762f86 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4756,7 +4756,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
}
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
- return -1;
+ throw;
}
return 0;
@@ -12102,16 +12102,22 @@ struct llama_model * llama_load_model_from_file(
};
}
- int status = llama_model_load(path_model, *model, params);
- GGML_ASSERT(status <= 0);
- if (status < 0) {
- if (status == -1) {
- LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
- } else if (status == -2) {
- LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
+ try {
+ int status = llama_model_load(path_model, *model, params);
+ GGML_ASSERT(status <= 0);
+ if (status < 0) {
+ if (status == -1) {
+ LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
+ } else if (status == -2) {
+ LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
+ }
+ delete model;
+ return nullptr;
}
+ } catch (...) {
+ LLAMA_LOG_ERROR("%s: exception loading model\n", __func__);
delete model;
- return nullptr;
+ throw;
}
return model;

View File

@@ -1,10 +1,10 @@
diff --git a/llama.cpp b/llama.cpp
index b27aa272..99372f9c 100644
index b19616e8..519b9602 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -9360,7 +9360,7 @@ struct llm_tokenizer_wpm {
@@ -9938,7 +9938,7 @@ struct llm_tokenizer_wpm {
}
uint32_t to_lower(uint32_t code) {
- static const std::locale locale("en_US.UTF-8");
+ static const std::locale locale("");

View File

@@ -1,45 +0,0 @@
From 9192432daf90b1bfec75577434a99b4ea70d54c8 Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Thu, 14 Mar 2024 12:09:50 -0700
Subject: [PATCH] fix clip free
---
examples/llava/clip.cpp | 4 ++++
examples/server/server.cpp | 6 ++++++
2 files changed, 10 insertions(+)
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index ef9e4ba7..b4ddfe6b 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -1673,6 +1673,10 @@ void clip_free(clip_ctx * ctx) {
ggml_free(ctx->ctx_data);
gguf_free(ctx->ctx_gguf);
+ ggml_backend_buffer_free(ctx->params_buffer);
+ ggml_backend_buffer_free(ctx->compute_buffer);
+ ggml_backend_free(ctx->backend);
+ ggml_gallocr_free(ctx->compute_alloc);
delete ctx;
}
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 8fe5e0b1..f927336b 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -353,6 +353,12 @@ struct llama_server_context
~llama_server_context()
{
+ if (clp_ctx)
+ {
+ LOG_INFO("freeing clip model", {});
+ clip_free(clp_ctx);
+ clp_ctx = nullptr;
+ }
if (ctx)
{
llama_free(ctx);
--
2.43.2

View File

@@ -10,8 +10,8 @@ mkdir -p dist
for TARGETARCH in arm64 amd64; do
rm -rf llm/llama.cpp/build
GOOS=darwin GOARCH=$TARGETARCH go generate ./...
CGO_ENABLED=1 GOOS=darwin GOARCH=$TARGETARCH go build -trimpath -o dist/ollama-darwin-$TARGETARCH
CGO_ENABLED=1 GOOS=darwin GOARCH=$TARGETARCH go build -trimpath -cover -o dist/ollama-darwin-$TARGETARCH-cov
CGO_ENABLED=1 GOOS=darwin GOARCH=$TARGETARCH go build -o dist/ollama-darwin-$TARGETARCH
CGO_ENABLED=1 GOOS=darwin GOARCH=$TARGETARCH go build -cover -o dist/ollama-darwin-$TARGETARCH-cov
done
lipo -create -output dist/ollama dist/ollama-darwin-arm64 dist/ollama-darwin-amd64

View File

@@ -53,7 +53,7 @@ function buildOllama() {
write-host "Building ollama CLI"
& go generate ./...
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
& go build -trimpath -ldflags "-s -w -X=github.com/jmorganca/ollama/version.Version=$script:VERSION -X=github.com/jmorganca/ollama/server.mode=release" .
& go build -ldflags "-s -w -X=github.com/jmorganca/ollama/version.Version=$script:VERSION -X=github.com/jmorganca/ollama/server.mode=release" .
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
if ("${env:KEY_CONTAINER}") {
& "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
@@ -68,7 +68,7 @@ function buildApp() {
write-host "Building Ollama App"
cd "${script:SRC_DIR}\app"
& windres -l 0 -o ollama.syso ollama.rc
& go build -trimpath -ldflags "-s -w -H windowsgui -X=github.com/jmorganca/ollama/version.Version=$script:VERSION -X=github.com/jmorganca/ollama/server.mode=release" .
& go build -ldflags "-s -w -H windowsgui -X=github.com/jmorganca/ollama/version.Version=$script:VERSION -X=github.com/jmorganca/ollama/server.mode=release" .
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
if ("${env:KEY_CONTAINER}") {
& "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `

View File

@@ -9,7 +9,7 @@ if grep -i "centos" /etc/system-release >/dev/null; then
# Centos 7 derivatives have too old of a git version to run our generate script
# uninstall and ignore failures
yum remove -y git
yum -y install epel-release centos-release-scl
yum -y install epel-release centos-release-scl prelink
yum -y install dnf
if [ "${MACHINE}" = "x86_64" ]; then
yum -y install https://repo.ius.io/ius-release-el7.rpm

View File

@@ -8,7 +8,6 @@ import (
"io"
"io/fs"
"log/slog"
"math"
"net"
"net/http"
"net/netip"
@@ -17,7 +16,6 @@ import (
"path/filepath"
"reflect"
"runtime"
"strconv"
"strings"
"sync"
"syscall"
@@ -209,7 +207,7 @@ func GenerateHandler(c *gin.Context) {
var sessionDuration time.Duration
if req.KeepAlive == nil {
sessionDuration = getDefaultSessionDuration()
sessionDuration = defaultSessionDuration
} else {
sessionDuration = req.KeepAlive.Duration
}
@@ -386,32 +384,6 @@ func GenerateHandler(c *gin.Context) {
streamResponse(c, ch)
}
func getDefaultSessionDuration() time.Duration {
if t, exists := os.LookupEnv("OLLAMA_KEEP_ALIVE"); exists {
v, err := strconv.Atoi(t)
if err != nil {
d, err := time.ParseDuration(t)
if err != nil {
return defaultSessionDuration
}
if d < 0 {
return time.Duration(math.MaxInt64)
}
return d
}
d := time.Duration(v) * time.Second
if d < 0 {
return time.Duration(math.MaxInt64)
}
return d
}
return defaultSessionDuration
}
func EmbeddingsHandler(c *gin.Context) {
loaded.mu.Lock()
defer loaded.mu.Unlock()
@@ -455,7 +427,7 @@ func EmbeddingsHandler(c *gin.Context) {
var sessionDuration time.Duration
if req.KeepAlive == nil {
sessionDuration = getDefaultSessionDuration()
sessionDuration = defaultSessionDuration
} else {
sessionDuration = req.KeepAlive.Duration
}
@@ -1256,7 +1228,7 @@ func ChatHandler(c *gin.Context) {
var sessionDuration time.Duration
if req.KeepAlive == nil {
sessionDuration = getDefaultSessionDuration()
sessionDuration = defaultSessionDuration
} else {
sessionDuration = req.KeepAlive.Duration
}