Compare commits

...

22 Commits

Author SHA1 Message Date
Jeffrey Morgan
791650ddef sched: only error when over-allocating system memory (#5626) 2024-07-11 00:53:12 -07:00
Jeffrey Morgan
efbf41ed81 llm: dont link cuda with compat libs (#5621) 2024-07-10 20:01:52 -07:00
Michael Yang
cf15589851 Merge pull request #5620 from ollama/mxyng/templates
update embedded templates
2024-07-10 17:16:24 -07:00
Michael Yang
19753c18c0 update embedded templates 2024-07-10 17:03:08 -07:00
Michael Yang
41be28096a add system prompt to first legacy template 2024-07-10 17:03:08 -07:00
Michael Yang
37a570f962 Merge pull request #5612 from ollama/mxyng/mem
chatglm graph
2024-07-10 14:18:33 -07:00
Michael Yang
5a739ff4cb chatglm graph 2024-07-10 13:43:47 -07:00
Jeffrey Morgan
4e262eb2a8 remove GGML_CUDA_FORCE_MMQ=on from build (#5588) 2024-07-10 13:17:13 -07:00
Daniel Hiltgen
4cfcbc328f Merge pull request #5124 from dhiltgen/amd_windows
Wire up windows AMD driver reporting
2024-07-10 12:50:23 -07:00
Daniel Hiltgen
79292ff3e0 Merge pull request #5555 from dhiltgen/msvc_deps
Bundle missing CRT libraries
2024-07-10 12:50:02 -07:00
Daniel Hiltgen
8ea500441d Merge pull request #5580 from dhiltgen/cuda_overhead
Detect CUDA OS overhead
2024-07-10 12:47:31 -07:00
Daniel Hiltgen
b50c818623 Merge pull request #5607 from dhiltgen/win_rocm_v6
Bump ROCm on windows to 6.1.2
2024-07-10 12:47:10 -07:00
Daniel Hiltgen
b99e750b62 Merge pull request #5605 from dhiltgen/merge_glitch
Remove duplicate merge glitch
2024-07-10 11:47:08 -07:00
Daniel Hiltgen
1f50356e8e Bump ROCm on windows to 6.1.2
This also adjusts our algorithm to favor our bundled ROCm.
I've confirmed VRAM reporting still doesn't work properly, so we
can't yet enable concurrency by default.
2024-07-10 11:01:22 -07:00
Daniel Hiltgen
22c81f62ec Remove duplicate merge glitch 2024-07-10 09:01:33 -07:00
Daniel Hiltgen
2d1e3c3229 Merge pull request #5503 from dhiltgen/dual_rocm
Workaround broken ROCm p2p copy
2024-07-09 15:44:16 -07:00
royjhan
4918fae535 OpenAI v1/completions: allow stop token list (#5551)
* stop token parsing fix

* add stop test
2024-07-09 14:01:26 -07:00
royjhan
0aff67877e separate request tests (#5578) 2024-07-09 13:48:31 -07:00
Daniel Hiltgen
f6f759fc5f Detect CUDA OS Overhead
This adds logic to detect skew between the driver and the
management library, which can be attributed to OS overhead,
and records it so we can adjust subsequent management-library
free VRAM updates and avoid OOM scenarios.
2024-07-09 12:21:50 -07:00
Daniel Hiltgen
b44320db13 Bundle missing CRT libraries
Some users are experiencing runner startup errors due
to not having these MSVC redist libraries on their host
2024-07-08 18:24:21 -07:00
Daniel Hiltgen
0bacb30007 Workaround broken ROCm p2p copy
Enable the build flag for llama.cpp to use CPU copy for multi-GPU scenarios.
2024-07-08 09:40:52 -07:00
Daniel Hiltgen
784bf88b0d Wire up windows AMD driver reporting
This seems to be the ROCm version, not the actual driver version, but
it may be useful for toggling VRAM reporting logic in the future
2024-06-18 16:22:47 -07:00
45 changed files with 448 additions and 334 deletions

View File

@@ -147,7 +147,7 @@ jobs:
run: |
$ErrorActionPreference = "Stop"
write-host "downloading AMD HIP Installer"
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-23.Q4-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
write-host "Installing AMD HIP"
Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
write-host "Completed AMD HIP"

View File

@@ -169,7 +169,7 @@ jobs:
run: |
$ErrorActionPreference = "Stop"
write-host "downloading AMD HIP Installer"
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-23.Q4-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
write-host "Installing AMD HIP"
Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
write-host "Completed AMD HIP"

View File

@@ -272,4 +272,4 @@ The following server settings may be used to adjust how Ollama handles concurren
- `OLLAMA_NUM_PARALLEL` - The maximum number of parallel requests each model will process at the same time. The default will auto-select either 4 or 1 based on available memory.
- `OLLAMA_MAX_QUEUE` - The maximum number of requests Ollama will queue when busy before rejecting additional requests. The default is 512.
Note: Windows with Radeon GPUs currently default to 1 model maximum due to limitations in ROCm v5.7 for available VRAM reporting. Once ROCm v6 is available, Windows Radeon will follow the defaults above. You may enable concurrent model loads on Radeon on Windows, but ensure you don't load more models than will fit into your GPUs' VRAM.
Note: Windows with Radeon GPUs currently default to 1 model maximum due to limitations in ROCm v5.7 for available VRAM reporting. Once ROCm v6.2 is available, Windows Radeon will follow the defaults above. You may enable concurrent model loads on Radeon on Windows, but ensure you don't load more models than will fit into your GPUs' VRAM.

View File

@@ -49,9 +49,17 @@ func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
}
func commonAMDValidateLibDir() (string, error) {
// We try to favor system paths first, so that we can wire up the subprocess to use
// the system version. Only use our bundled version if the system version doesn't work
// This gives users more recovery options if versions have subtle problems at runtime
// Favor our bundled version
// Installer payload location if we're running the installed binary
exe, err := os.Executable()
if err == nil {
rocmTargetDir := filepath.Join(filepath.Dir(exe), "rocm")
if rocmLibUsable(rocmTargetDir) {
slog.Debug("detected ROCM next to ollama executable " + rocmTargetDir)
return rocmTargetDir, nil
}
}
// Prefer explicit HIP env var
hipPath := os.Getenv("HIP_PATH")
@@ -87,14 +95,5 @@ func commonAMDValidateLibDir() (string, error) {
}
}
// Installer payload location if we're running the installed binary
exe, err := os.Executable()
if err == nil {
rocmTargetDir := filepath.Join(filepath.Dir(exe), "rocm")
if rocmLibUsable(rocmTargetDir) {
slog.Debug("detected ROCM next to ollama executable " + rocmTargetDir)
return rocmTargetDir, nil
}
}
return "", fmt.Errorf("no suitable rocm found, falling back to CPU")
}
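
Restated outside the diff, the reordered lookup is easier to see. A minimal sketch of the new search order (bundled payload first, then HIP_PATH, then standard locations); rocmLibUsable is stubbed and the standard-location scan is reduced to a fixed list, both assumptions for illustration:

```go
package main

import (
	"fmt"
	"os"
	"path/filepath"
)

// rocmLibUsable is stubbed for illustration; the real check globs for
// hipblas/rocblas payloads inside the candidate directory.
func rocmLibUsable(dir string) bool {
	info, err := os.Stat(dir)
	return err == nil && info.IsDir()
}

// findROCmLibDir (illustrative name) mirrors the reordered logic in
// commonAMDValidateLibDir: bundled payload first, then HIP_PATH, then
// well-known install locations.
func findROCmLibDir() (string, error) {
	// 1. Favor the ROCm payload installed next to the ollama binary.
	if exe, err := os.Executable(); err == nil {
		if dir := filepath.Join(filepath.Dir(exe), "rocm"); rocmLibUsable(dir) {
			return dir, nil
		}
	}
	// 2. Fall back to an explicit HIP_PATH from the environment.
	if hip := os.Getenv("HIP_PATH"); hip != "" && rocmLibUsable(hip) {
		return hip, nil
	}
	// 3. Finally scan standard install locations (fixed list here).
	for _, dir := range []string{`C:\Program Files\AMD\ROCm\6.1\bin`, "/opt/rocm/lib"} {
		if rocmLibUsable(dir) {
			return dir, nil
		}
	}
	return "", fmt.Errorf("no suitable rocm found, falling back to CPU")
}

func main() {
	dir, err := findROCmLibDir()
	fmt.Println(dir, err)
}
```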

View File

@@ -84,9 +84,8 @@ func (hl *HipLib) AMDDriverVersion() (driverMajor, driverMinor int, err error) {
}
slog.Debug("hipDriverGetVersion", "version", version)
// TODO - this isn't actually right, but the docs claim hipDriverGetVersion isn't accurate anyway...
driverMajor = version / 1000
driverMinor = (version - (driverMajor * 1000)) / 10
driverMajor = version / 10000000
driverMinor = (version - (driverMajor * 10000000)) / 100000
return driverMajor, driverMinor, nil
}
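
The new divisors imply the raw value packs the version as major*10000000 + minor*100000 plus a patch remainder. A small sketch of the decode, using a made-up raw value purely for illustration:

```go
package main

import "fmt"

// decodeHIPDriverVersion splits a raw hipDriverGetVersion value using the
// encoding implied by the patch: major*10000000 + minor*100000 + patch.
func decodeHIPDriverVersion(version int) (major, minor int) {
	major = version / 10000000
	minor = (version - major*10000000) / 100000
	return major, minor
}

func main() {
	// Hypothetical raw value for a 6.1.x driver:
	// 6*10000000 + 1*100000 + 42 = 60100042.
	major, minor := decodeHIPDriverVersion(60100042)
	fmt.Printf("%d.%d\n", major, minor) // prints 6.1
}
```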

View File

@@ -22,8 +22,8 @@ const (
var (
// Used to validate if the given ROCm lib is usable
ROCmLibGlobs = []string{"hipblas.dll", "rocblas"} // TODO - probably include more coverage of files here...
RocmStandardLocations = []string{"C:\\Program Files\\AMD\\ROCm\\5.7\\bin"} // TODO glob?
ROCmLibGlobs = []string{"hipblas.dll", "rocblas"} // This is not sufficient to discern v5 vs v6
RocmStandardLocations = []string{"C:\\Program Files\\AMD\\ROCm\\6.1\\bin"} // TODO glob?
)
func AMDGetGPUInfo() []RocmGPUInfo {
@@ -35,12 +35,11 @@ func AMDGetGPUInfo() []RocmGPUInfo {
}
defer hl.Release()
// TODO - this reports incorrect version information, so omitting for now
// driverMajor, driverMinor, err := hl.AMDDriverVersion()
// if err != nil {
// // For now this is benign, but we may eventually need to fail compatibility checks
// slog.Debug("error looking up amd driver version", "error", err)
// }
driverMajor, driverMinor, err := hl.AMDDriverVersion()
if err != nil {
// For now this is benign, but we may eventually need to fail compatibility checks
slog.Debug("error looking up amd driver version", "error", err)
}
// Note: the HIP library automatically handles subsetting to any HIP_VISIBLE_DEVICES the user specified
count := hl.HipGetDeviceCount()
@@ -132,10 +131,8 @@ func AMDGetGPUInfo() []RocmGPUInfo {
MinimumMemory: rocmMinimumMemory,
Name: name,
Compute: gfx,
// TODO - this information isn't accurate on windows, so don't report it until we find the right way to retrieve
// DriverMajor: driverMajor,
// DriverMinor: driverMinor,
DriverMajor: driverMajor,
DriverMinor: driverMinor,
},
index: i,
}

View File

@@ -274,6 +274,28 @@ func GetGPUInfo() GpuInfoList {
gpuInfo.DriverMajor = driverMajor
gpuInfo.DriverMinor = driverMinor
// query the management library as well so we can record any skew between the two
// which represents overhead on the GPU we must set aside on subsequent updates
if cHandles.nvml != nil {
C.nvml_get_free(*cHandles.nvml, C.int(gpuInfo.index), &memInfo.free, &memInfo.total, &memInfo.used)
if memInfo.err != nil {
slog.Warn("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
C.free(unsafe.Pointer(memInfo.err))
} else {
if memInfo.free != 0 && uint64(memInfo.free) > gpuInfo.FreeMemory {
gpuInfo.OSOverhead = uint64(memInfo.free) - gpuInfo.FreeMemory
slog.Info("detected OS VRAM overhead",
"id", gpuInfo.ID,
"library", gpuInfo.Library,
"compute", gpuInfo.Compute,
"driver", fmt.Sprintf("%d.%d", gpuInfo.DriverMajor, gpuInfo.DriverMinor),
"name", gpuInfo.Name,
"overhead", format.HumanBytes2(gpuInfo.OSOverhead),
)
}
}
}
// TODO potentially sort on our own algorithm instead of what the underlying GPU library does...
cudaGPUs = append(cudaGPUs, gpuInfo)
}
@@ -374,9 +396,14 @@ func GetGPUInfo() GpuInfoList {
slog.Warn("error looking up nvidia GPU memory")
continue
}
if cHandles.nvml != nil && gpu.OSOverhead > 0 {
// When using the management library update based on recorded overhead
memInfo.free -= C.uint64_t(gpu.OSOverhead)
}
slog.Debug("updating cuda memory data",
"gpu", gpu.ID,
"name", gpu.Name,
"overhead", format.HumanBytes2(gpu.OSOverhead),
slog.Group(
"before",
"total", format.HumanBytes2(gpu.TotalMemory),

View File

@@ -52,7 +52,8 @@ type CPUInfo struct {
type CudaGPUInfo struct {
GpuInfo
index int //nolint:unused,nolintlint
OSOverhead uint64 // Memory overhead between the driver library and management library
index int //nolint:unused,nolintlint
}
type CudaGPUInfoList []CudaGPUInfo

View File

@@ -178,7 +178,7 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then
CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${OLLAMA_CUSTOM_CUDA_DEFS}"
echo "Building custom CUDA GPU"
else
CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_FLAGS=-t8 -DGGML_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} -DCMAKE_LIBRARY_PATH=/usr/local/cuda/compat"
CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_FLAGS=-t8 -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
fi
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS}"
BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}"
@@ -254,7 +254,7 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then
ROCM_VARIANT=_v$(ls ${ROCM_PATH}/lib/librocblas.so.*.*.????? | cut -f5 -d. || true)
fi
init_vars
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DGGML_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DGGML_HIPBLAS=on -DLLAMA_CUDA_NO_PEER_COPY=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
# Users building from source can tune the exact flags we pass to cmake for configuring llama.cpp
if [ -n "${OLLAMA_CUSTOM_ROCM_DEFS}" ]; then
echo "OLLAMA_CUSTOM_ROCM_DEFS=\"${OLLAMA_CUSTOM_ROCM_DEFS}\""

View File

@@ -6,18 +6,9 @@ function amdGPUs {
if ($env:AMDGPU_TARGETS) {
return $env:AMDGPU_TARGETS
}
# TODO - load from some common data file for linux + windows build consistency
# Current supported rocblas list from ROCm v6.1.2 on windows
$GPU_LIST = @(
"gfx900"
"gfx906:xnack-"
"gfx908:xnack-"
"gfx90a:xnack+"
"gfx90a:xnack-"
"gfx940"
"gfx941"
"gfx942"
"gfx1010"
"gfx1012"
"gfx1030"
"gfx1100"
"gfx1101"
@@ -366,6 +357,7 @@ function build_rocm() {
"-DCMAKE_C_COMPILER=clang.exe",
"-DCMAKE_CXX_COMPILER=clang++.exe",
"-DGGML_HIPBLAS=on",
"-DLLAMA_CUDA_NO_PEER_COPY=on",
"-DHIP_PLATFORM=amd",
"-DGGML_AVX=on",
"-DGGML_AVX2=off",
@@ -394,7 +386,6 @@ function build_rocm() {
sign
install
# Assumes v5.7, may need adjustments for v6
rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\"
md "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\rocblas\library\" -ea 0 > $null
cp "${env:HIP_PATH}\bin\hipblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\"

View File

@@ -424,6 +424,32 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
4*batch*(3*embedding+vocab)+embedding*vocab*105/128,
4*batch*(2*embedding+1+2*embeddingHeadsK*headsKV+context+context*headsKV)+4*embeddingHeadsK*context*headsKV+embedding*embeddingHeadsK*headsKV*9/16,
)
case "chatglm":
fullOffload = 4 * batch * (embedding + vocab)
partialOffload = 4*batch*(embedding+vocab) + embedding*vocab*105/128
if qkvBias, ok := layers["blk.0"]["attn_qkv.bias"]; ok {
fullOffload = max(
fullOffload,
4*batch*(2+
2*embedding+
context+
context*heads+
embeddingHeadsK*heads+
qkvBias.Shape[0]),
)
partialOffload = max(
partialOffload,
4*batch*(1+
2*embedding+
embeddingHeadsK*heads+
context+
context*heads)+
4*embeddingHeadsK*context+
4*context*embeddingHeadsK+
4*qkvBias.Shape[0],
)
}
}
return
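
As a rough sanity check of the base formulas above, a sketch that evaluates them for hypothetical chatglm dimensions; real values come from the model's GGUF metadata:

```go
package main

import "fmt"

func main() {
	// Hypothetical chatglm-style dimensions, purely illustrative.
	var batch, embedding, vocab uint64 = 512, 4096, 65024

	// Base estimates from GraphSize, before the qkv-bias refinement.
	fullOffload := 4 * batch * (embedding + vocab)
	partialOffload := 4*batch*(embedding+vocab) + embedding*vocab*105/128

	fmt.Println(fullOffload)    // 141557760 bytes, ~142 MB
	fmt.Println(partialOffload) // 360038400 bytes, ~360 MB
}
```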

View File

@@ -122,6 +122,15 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
}
}
// On linux, over-allocating CPU memory will almost always result in an error
if runtime.GOOS == "linux" {
systemMemoryRequired := estimate.TotalSize - estimate.VRAMSize
if systemMemoryRequired > systemTotalMemory {
slog.Warn("model request too large for system", "requested", format.HumanBytes2(systemMemoryRequired), "system", format.HumanBytes2(systemTotalMemory))
return nil, fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(systemMemoryRequired), format.HumanBytes2(systemTotalMemory))
}
}
estimate.log()
// Loop through potential servers
@@ -254,10 +263,6 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
params = append(params, "--tensor-split", estimate.TensorSplit)
}
if estimate.TensorSplit != "" {
params = append(params, "--tensor-split", estimate.TensorSplit)
}
for i := range len(servers) {
dir := availableServers[servers[i]]
if dir == "" {
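
The new guard reduces to simple arithmetic: whatever portion of the model is not offloaded to VRAM must fit in system RAM. A sketch with hypothetical sizes:

```go
package main

import "fmt"

func main() {
	// Hypothetical estimate: a 20 GiB model with 12 GiB offloaded to VRAM
	// needs 8 GiB of system memory for the remainder.
	var totalSize, vramSize, systemTotal uint64 = 20 << 30, 12 << 30, 16 << 30

	required := totalSize - vramSize
	if required > systemTotal {
		fmt.Println("model requires more system memory than is available")
	} else {
		fmt.Printf("fits: needs %d GiB of %d GiB\n", required>>30, systemTotal>>30)
	}
}
```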

View File

@@ -338,12 +338,16 @@ func fromCompleteRequest(r CompletionRequest) (api.GenerateRequest, error) {
switch stop := r.Stop.(type) {
case string:
options["stop"] = []string{stop}
case []string:
options["stop"] = stop
default:
if r.Stop != nil {
return api.GenerateRequest{}, fmt.Errorf("invalid type for 'stop' field: %T", r.Stop)
case []any:
var stops []string
for _, s := range stop {
if str, ok := s.(string); ok {
stops = append(stops, str)
} else {
return api.GenerateRequest{}, fmt.Errorf("invalid type for 'stop' field: %T", s)
}
}
options["stop"] = stops
}
if r.MaxTokens != nil {
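
Because the request body is decoded into a generic interface, a JSON stop array arrives as []any rather than []string, which is what the new case handles. A standalone sketch of that normalization (normalizeStop is an illustrative name):

```go
package main

import (
	"encoding/json"
	"fmt"
)

// normalizeStop mirrors the new switch: a bare string becomes a one-element
// list, and a JSON array (decoded as []any) is converted element by element.
func normalizeStop(stop any) ([]string, error) {
	if stop == nil {
		return nil, nil // field was absent
	}
	switch s := stop.(type) {
	case string:
		return []string{s}, nil
	case []any:
		stops := make([]string, 0, len(s))
		for _, v := range s {
			str, ok := v.(string)
			if !ok {
				return nil, fmt.Errorf("invalid type for 'stop' field: %T", v)
			}
			stops = append(stops, str)
		}
		return stops, nil
	}
	return nil, fmt.Errorf("invalid type for 'stop' field: %T", stop)
}

func main() {
	var req struct {
		Stop any `json:"stop"`
	}
	_ = json.Unmarshal([]byte(`{"stop": ["\n", "stop"]}`), &req)
	fmt.Println(normalizeStop(req.Stop)) // normalized []string and nil error
}
```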

View File

@@ -3,7 +3,6 @@ package openai
import (
"bytes"
"encoding/json"
"fmt"
"io"
"net/http"
"net/http/httptest"
@@ -16,7 +15,133 @@ import (
"github.com/stretchr/testify/assert"
)
func TestMiddleware(t *testing.T) {
func TestMiddlewareRequests(t *testing.T) {
type testCase struct {
Name string
Method string
Path string
Handler func() gin.HandlerFunc
Setup func(t *testing.T, req *http.Request)
Expected func(t *testing.T, req *http.Request)
}
var capturedRequest *http.Request
captureRequestMiddleware := func() gin.HandlerFunc {
return func(c *gin.Context) {
bodyBytes, _ := io.ReadAll(c.Request.Body)
c.Request.Body = io.NopCloser(bytes.NewReader(bodyBytes))
capturedRequest = c.Request
c.Next()
}
}
testCases := []testCase{
{
Name: "chat handler",
Method: http.MethodPost,
Path: "/api/chat",
Handler: ChatMiddleware,
Setup: func(t *testing.T, req *http.Request) {
body := ChatCompletionRequest{
Model: "test-model",
Messages: []Message{{Role: "user", Content: "Hello"}},
}
bodyBytes, _ := json.Marshal(body)
req.Body = io.NopCloser(bytes.NewReader(bodyBytes))
req.Header.Set("Content-Type", "application/json")
},
Expected: func(t *testing.T, req *http.Request) {
var chatReq api.ChatRequest
if err := json.NewDecoder(req.Body).Decode(&chatReq); err != nil {
t.Fatal(err)
}
if chatReq.Messages[0].Role != "user" {
t.Fatalf("expected 'user', got %s", chatReq.Messages[0].Role)
}
if chatReq.Messages[0].Content != "Hello" {
t.Fatalf("expected 'Hello', got %s", chatReq.Messages[0].Content)
}
},
},
{
Name: "completions handler",
Method: http.MethodPost,
Path: "/api/generate",
Handler: CompletionsMiddleware,
Setup: func(t *testing.T, req *http.Request) {
temp := float32(0.8)
body := CompletionRequest{
Model: "test-model",
Prompt: "Hello",
Temperature: &temp,
Stop: []string{"\n", "stop"},
}
bodyBytes, _ := json.Marshal(body)
req.Body = io.NopCloser(bytes.NewReader(bodyBytes))
req.Header.Set("Content-Type", "application/json")
},
Expected: func(t *testing.T, req *http.Request) {
var genReq api.GenerateRequest
if err := json.NewDecoder(req.Body).Decode(&genReq); err != nil {
t.Fatal(err)
}
if genReq.Prompt != "Hello" {
t.Fatalf("expected 'Hello', got %s", genReq.Prompt)
}
if genReq.Options["temperature"] != 1.6 {
t.Fatalf("expected 1.6, got %f", genReq.Options["temperature"])
}
stopTokens, ok := genReq.Options["stop"].([]any)
if !ok {
t.Fatalf("expected stop tokens to be a list")
}
if stopTokens[0] != "\n" || stopTokens[1] != "stop" {
t.Fatalf("expected ['\\n', 'stop'], got %v", stopTokens)
}
},
},
}
gin.SetMode(gin.TestMode)
router := gin.New()
endpoint := func(c *gin.Context) {
c.Status(http.StatusOK)
}
for _, tc := range testCases {
t.Run(tc.Name, func(t *testing.T) {
router = gin.New()
router.Use(captureRequestMiddleware())
router.Use(tc.Handler())
router.Handle(tc.Method, tc.Path, endpoint)
req, _ := http.NewRequest(tc.Method, tc.Path, nil)
if tc.Setup != nil {
tc.Setup(t, req)
}
resp := httptest.NewRecorder()
router.ServeHTTP(resp, req)
tc.Expected(t, capturedRequest)
})
}
}
func TestMiddlewareResponses(t *testing.T) {
type testCase struct {
Name string
Method string
@@ -30,159 +155,7 @@ func TestMiddleware(t *testing.T) {
testCases := []testCase{
{
Name: "chat handler",
Method: http.MethodPost,
Path: "/api/chat",
TestPath: "/api/chat",
Handler: ChatMiddleware,
Endpoint: func(c *gin.Context) {
var chatReq api.ChatRequest
if err := c.ShouldBindJSON(&chatReq); err != nil {
c.JSON(http.StatusBadRequest, gin.H{"error": "invalid request"})
return
}
userMessage := chatReq.Messages[0].Content
var assistantMessage string
switch userMessage {
case "Hello":
assistantMessage = "Hello!"
default:
assistantMessage = "I'm not sure how to respond to that."
}
c.JSON(http.StatusOK, api.ChatResponse{
Message: api.Message{
Role: "assistant",
Content: assistantMessage,
},
})
},
Setup: func(t *testing.T, req *http.Request) {
body := ChatCompletionRequest{
Model: "test-model",
Messages: []Message{{Role: "user", Content: "Hello"}},
}
bodyBytes, _ := json.Marshal(body)
req.Body = io.NopCloser(bytes.NewReader(bodyBytes))
req.Header.Set("Content-Type", "application/json")
},
Expected: func(t *testing.T, resp *httptest.ResponseRecorder) {
assert.Equal(t, http.StatusOK, resp.Code)
var chatResp ChatCompletion
if err := json.NewDecoder(resp.Body).Decode(&chatResp); err != nil {
t.Fatal(err)
}
if chatResp.Object != "chat.completion" {
t.Fatalf("expected chat.completion, got %s", chatResp.Object)
}
if chatResp.Choices[0].Message.Content != "Hello!" {
t.Fatalf("expected Hello!, got %s", chatResp.Choices[0].Message.Content)
}
},
},
{
Name: "completions handler",
Method: http.MethodPost,
Path: "/api/generate",
TestPath: "/api/generate",
Handler: CompletionsMiddleware,
Endpoint: func(c *gin.Context) {
c.JSON(http.StatusOK, api.GenerateResponse{
Response: "Hello!",
})
},
Setup: func(t *testing.T, req *http.Request) {
body := CompletionRequest{
Model: "test-model",
Prompt: "Hello",
}
bodyBytes, _ := json.Marshal(body)
req.Body = io.NopCloser(bytes.NewReader(bodyBytes))
req.Header.Set("Content-Type", "application/json")
},
Expected: func(t *testing.T, resp *httptest.ResponseRecorder) {
assert.Equal(t, http.StatusOK, resp.Code)
var completionResp Completion
if err := json.NewDecoder(resp.Body).Decode(&completionResp); err != nil {
t.Fatal(err)
}
if completionResp.Object != "text_completion" {
t.Fatalf("expected text_completion, got %s", completionResp.Object)
}
if completionResp.Choices[0].Text != "Hello!" {
t.Fatalf("expected Hello!, got %s", completionResp.Choices[0].Text)
}
},
},
{
Name: "completions handler with params",
Method: http.MethodPost,
Path: "/api/generate",
TestPath: "/api/generate",
Handler: CompletionsMiddleware,
Endpoint: func(c *gin.Context) {
var generateReq api.GenerateRequest
if err := c.ShouldBindJSON(&generateReq); err != nil {
c.JSON(http.StatusBadRequest, gin.H{"error": "invalid request"})
return
}
temperature := generateReq.Options["temperature"].(float64)
var assistantMessage string
switch temperature {
case 1.6:
assistantMessage = "Received temperature of 1.6"
default:
assistantMessage = fmt.Sprintf("Received temperature of %f", temperature)
}
c.JSON(http.StatusOK, api.GenerateResponse{
Response: assistantMessage,
})
},
Setup: func(t *testing.T, req *http.Request) {
temp := float32(0.8)
body := CompletionRequest{
Model: "test-model",
Prompt: "Hello",
Temperature: &temp,
}
bodyBytes, _ := json.Marshal(body)
req.Body = io.NopCloser(bytes.NewReader(bodyBytes))
req.Header.Set("Content-Type", "application/json")
},
Expected: func(t *testing.T, resp *httptest.ResponseRecorder) {
assert.Equal(t, http.StatusOK, resp.Code)
var completionResp Completion
if err := json.NewDecoder(resp.Body).Decode(&completionResp); err != nil {
t.Fatal(err)
}
if completionResp.Object != "text_completion" {
t.Fatalf("expected text_completion, got %s", completionResp.Object)
}
if completionResp.Choices[0].Text != "Received temperature of 1.6" {
t.Fatalf("expected Received temperature of 1.6, got %s", completionResp.Choices[0].Text)
}
},
},
{
Name: "completions handler with error",
Name: "completions handler error forwarding",
Method: http.MethodPost,
Path: "/api/generate",
TestPath: "/api/generate",

View File

@@ -107,9 +107,12 @@ function gatherDependencies() {
# TODO - this varies based on host build system and MSVC version - drive from dumpbin output
# currently works for Win11 + MSVC 2019 + Cuda V11
cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140.dll" "${script:DEPS_DIR}\ollama_runners\"
cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140*.dll" "${script:DEPS_DIR}\ollama_runners\"
cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\ollama_runners\"
cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\ollama_runners\"
foreach ($part in $("runtime", "stdio", "filesystem", "math", "convert", "heap", "string", "time", "locale", "environment")) {
cp "$env:VCToolsRedistDir\..\..\..\Tools\Llvm\x64\bin\api-ms-win-crt-${part}*.dll" "${script:DEPS_DIR}\ollama_runners\"
}
cp "${script:SRC_DIR}\app\ollama_welcome.ps1" "${script:SRC_DIR}\dist\"

View File

@@ -161,7 +161,7 @@ func TestChatPrompt(t *testing.T) {
{Role: "user", Content: "A test. And a thumping good one at that, I'd wager."},
},
expect: expect{
prompt: "You're a test, Harry! I-I'm a what? You are the Test Who Lived. A test. And a thumping good one at that, I'd wager. ",
prompt: "You are the Test Who Lived. You're a test, Harry! I-I'm a what? A test. And a thumping good one at that, I'd wager. ",
},
},
}

View File

@@ -546,8 +546,8 @@ func TestCreateDetectTemplate(t *testing.T) {
checkFileExists(t, filepath.Join(p, "blobs", "*"), []string{
filepath.Join(p, "blobs", "sha256-553c4a3f747b3d22a4946875f1cc8ed011c2930d83f864a0c7265f9ec0a20413"),
filepath.Join(p, "blobs", "sha256-9512c372dfc7d84d6065b8dd2b601aeed8cc1a78e7a7aa784a42fff37f5524b7"),
filepath.Join(p, "blobs", "sha256-b8b78cb8c6eefd14c06f1af042e6161255bf87bbf2dd14fce57cdac893db8139"),
filepath.Join(p, "blobs", "sha256-68b0323b2f21572bc09ba07554b16b379a5713ee48ef8c25a7661a1f71cfce77"),
filepath.Join(p, "blobs", "sha256-eb72fb7c550ee1f1dec4039bd65382acecf5f7536a30fb7ccace39a8d0cb590b"),
})
})

View File

@@ -135,11 +135,6 @@ func (s *Scheduler) processPending(ctx context.Context) {
}
for {
cpus := s.getCpuFn()
var systemMem gpu.GpuInfo
if len(cpus) > 0 {
systemMem = cpus[0]
}
var runnerToExpire *runnerRef
s.loadedMu.Lock()
runner := s.loaded[pending.model.ModelPath]
@@ -193,38 +188,6 @@ func (s *Scheduler) processPending(ctx context.Context) {
break
}
estimate := llm.EstimateGPULayers(gpus, ggml, pending.model.ProjectorPaths, pending.opts)
maxSize := systemMem.FreeMemory
// Add available GPU memory to the total pool
// macOS hardware has unified memory so don't double count
if runtime.GOOS != "darwin" {
for _, gpu := range gpus {
if gpu.Library == "cpu" {
continue
}
if loadedCount == 0 {
// If no other models are loaded, set the limit based on what's available
maxSize += gpu.FreeMemory
} else {
// Other models could be unloaded, favor total memory for limit
maxSize += gpu.TotalMemory
}
}
}
// Block attempting to load a model larger than system memory + GPU memory
if estimate.TotalSize > maxSize {
slog.Warn("model request too large for system", "requested", format.HumanBytes2(estimate.TotalSize), "system", format.HumanBytes2(maxSize))
// Linux will crash if over-allocating memory - return an error to the user.
// TODO (jmorganca): add reasonable upper limits for darwin and windows as well
if runtime.GOOS == "linux" {
pending.errCh <- fmt.Errorf("requested model (%s) is too large for this system (%s)", format.HumanBytes2(estimate.TotalSize), format.HumanBytes2(maxSize))
break
}
}
// Evaluate if the model will fit in the available system memory, or if we should unload a model first
if len(gpus) == 1 && gpus[0].Library == "cpu" {
// simplifying assumption of defaultParallel when in CPU mode

View File

@@ -3,6 +3,6 @@
{{- end }}
{{- range .Messages }}<start_{{ .Role }}>{{ .Content }}<end_message>
{{- end }}<start_assistant>
{{- else }}
{{- else -}}
{{ if .System }}<start_system>{{ .System }}<end_message>{{ end }}{{ if .Prompt }}<start_user>{{ .Prompt }}<end_message>{{ end }}<start_assistant>{{ .Response }}<end_message>
{{- end }}
{{- end -}}

View File

@@ -1,6 +1,7 @@
{{- if .Messages }}
{{- if .System }}{{ .System }}
{{- end }}
{{ end }}
{{- range .Messages }}
{{- if eq .Role "user" }}### Instruction:
{{- else if eq .Role "assistant" }}### Response:
@@ -8,7 +9,7 @@
{{ .Content }}
{{ end }}### Response:
{{ else }}
{{ else -}}
{{ if .System }}{{ .System }}
{{ end }}{{ if .Prompt }}### Instruction:
@@ -16,4 +17,5 @@
{{ end }}### Response:
{{ .Response }}
{{- end }}
{{ end -}}

View File

@@ -5,11 +5,11 @@
{{- range .Messages }}<|im_start|>{{ .Role }}
{{ .Content }}<|im_end|>
{{ end }}<|im_start|>assistant
{{ else }}
{{ else -}}
{{ if .System }}<|im_start|>system
{{ .System }}<|im_end|>
{{ end }}{{ if .Prompt }}<|im_start|>user
{{ .Prompt }}<|im_end|>
{{ end }}<|im_start|>assistant
{{ .Response }}<|im_end|>
{{- end }}
{{ end -}}

View File

@@ -8,10 +8,11 @@
{{- end }} {{ .Content }}
{{ end }}Assistant:
{{- else }}
{{- else -}}
{{ if .System }}System: {{ .System }}
{{ end }}{{ if .Prompt }}User: {{ .Prompt }}
{{ end }}Assistant: <|begin_of_text|>{{ .Response }}
{{- end }}
{{ end }}Assistant: {{ .Response }}
{{ end -}}

View File

@@ -7,13 +7,13 @@
{{ .Content }} <step> {{ end }}Source: assistant
Destination: user
{{ else }}
{{ if .System }} Source: system
{{ else -}}
{{ if .System }}Source: system
{{ .System }} <step>{{ end }} Source: user
{{ .System }} <step> {{ end }}Source: user
{{ .Prompt }} <step> Source: assistant
Destination: user
{{ .Response }}<step>
{{- end }}
{{ .Response }} <step>
{{- end -}}

View File

@@ -6,8 +6,10 @@
{{ else if eq .Role "assistant" }}Falcon:
{{ end }}{{ .Content }}
{{ end }}Falcon:
{{ else }}
{{ if .System }}{{ .System }}
{{ end }}{{ if .Prompt }}User: {{ .Prompt }}
{{ end }}Assistant: {{ .Response }}
{{- end }}
{{ else -}}
{{ if .System }}System: {{ .System }}
{{ end }}{{ if .Prompt }}User:
{{ .Prompt }}
{{ end }}Falcon:
{{ .Response }}
{{ end -}}

View File

@@ -8,9 +8,10 @@
{{- end }}
{{ .Content }}<end_of_turn>
{{ end }}<start_of_turn>model
{{ else }}
{{ else -}}
<start_of_turn>user
{{ if .System }}{{ .System }} {{ end }}{{ .Prompt }}<end_of_turn>
{{ if .System }}{{ .System }}
{{ end }}{{ .Prompt }}<end_of_turn>
<start_of_turn>model
{{ .Response }}<end_of_turn>
{{- end }}
{{ end -}}

View File

@@ -10,9 +10,8 @@
{{ .Content }}
{{ end }}Answer:
{{ else }}
{{ if .System }}
System:
{{ else -}}
{{ if .System }}System:
{{ .System }}
{{ end }}{{ if .Prompt }}Question:
@@ -20,4 +19,5 @@ System:
{{ end }}Answer:
{{ .Response }}
{{- end }}
{{ end -}}

View File

@@ -9,8 +9,8 @@
{{- else }} [/INST] {{ .Content }}</s><s>
{{- end }}
{{- end }} [/INST]
{{- else }}
[INST] <<SYS>>{{ .System }}<</SYS>>
{{- else -}}
[INST] <<SYS>>{{ if .System }}{{ .System }}{{ end }}<</SYS>>
{{ .Prompt }} [/INST] {{ .Response }}
{{- end }}
{{ .Prompt }} [/INST] {{ .Response }}</s>
{{- end -}}

View File

@@ -8,7 +8,7 @@
{{ .Content }}<|eot_id|>
{{- end }}<|start_header_id|>assistant<|end_header_id|>
{{ else }}
{{ else -}}
{{ if .System }}<|start_header_id|>system<|end_header_id|>
{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>
@@ -16,4 +16,4 @@
{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>
{{ .Response }}<|eot_id|>
{{- end }}
{{- end -}}

View File

@@ -9,7 +9,7 @@
{{ .Content }}
{{ end }}@@ Response
{{ else }}
{{ else -}}
{{ if .System }}{{ .System }}
{{ end }}{{ if .Prompt }}@@ Instruction
@@ -17,4 +17,5 @@
{{ end }}@@ Response
{{ .Response }}
{{- end }}
{{ end -}}

View File

@@ -5,5 +5,6 @@
{{- else if eq .Role "assistant" }}[/INST] {{ .Content }}</s>
{{- end }}
{{- end }}[/INST]
{{- else }}[INST] {{ if .System }}{{ .System }} {{ end }}{{ .Prompt }} [/INST] {{ .Response }}
{{- end }}
{{- else -}}
[INST] {{ if .System }}{{ .System }} {{ end }}{{ .Prompt }}[/INST] {{ .Response }}</s>
{{- end -}}

View File

@@ -1,11 +1,11 @@
{{- if .Messages }}
{{- if .System }}GPT Correct System: {{ .System }}<|end_of_turn|>
{{- if .System }}GPT4 Correct System: {{ .System }}<|end_of_turn|>
{{- end }}
{{- range .Messages }}GPT Correct
{{- range .Messages }}GPT4 Correct
{{- if eq .Role "user" }} User:
{{- else if eq .Role "assistant" }} Assistant:
{{- end }} {{ .Content }}<|end_of_turn|>
{{- end }}GPT Correct Assistant:
{{- else }}
{{ .System }}<|end_of_turn|>GPT4 Correct User: {{ .Prompt }}<|end_of_turn|>GPT4 Correct Assistant: {{ .Response }}<|end_of_turn|>
{{- end }}
{{- end }}GPT4 Correct Assistant:
{{- else -}}
{{ if .System }}GPT4 Correct System: {{ .System }}<|end_of_turn|>{{ end }}GPT4 Correct User: {{ .Prompt }}<|end_of_turn|>GPT4 Correct Assistant: {{ .Response }}<|end_of_turn|>
{{- end -}}

View File

@@ -5,11 +5,11 @@
{{- range .Messages }}<|{{ .Role }}|>
{{ .Content }}<|end|>
{{ end }}<|assistant|>
{{ else }}
{{ else -}}
{{ if .System }}<|system|>
{{ .System }}<|end|>
{{ end }}{{ if .Prompt }}<|user|>
{{ .Prompt }}<|end|>
{{ end }}<|assistant|>
{{ .Response }}<|end|>
{{- end }}
{{ end -}}

View File

@@ -10,7 +10,7 @@
{{ .Content }}</s>
{{ end }}
{{ end }}### Assistant:
{{ else }}
{{ else -}}
{{ if .System }}### System:
{{ .System }}
@@ -18,5 +18,6 @@
{{ .Prompt }}
{{ end }}### Assistant:
{{ .Response }}
{{- end }}
{{ .Response }}</s>
{{ end -}}

View File

@@ -11,14 +11,13 @@
{{ end }}
{{- end }}### Response
{{ else }}
{{ else -}}
{{ if .System }}{{ .System }}
{{ end }}{{ if .Prompt }}### Instruction
{{ .Prompt }}
{{ end }}### Response
{{ .Response }}<|endoftext|>
{{- end }}
{{ end -}}

View File

@@ -143,11 +143,14 @@ func (t *Template) Vars() []string {
type Values struct {
Messages []api.Message
// forceLegacy is a flag used to test compatibility with legacy templates
forceLegacy bool
}
func (t *Template) Execute(w io.Writer, v Values) error {
system, collated := collate(v.Messages)
if slices.Contains(t.Vars(), "messages") {
if !v.forceLegacy && slices.Contains(t.Vars(), "messages") {
return t.Template.Execute(w, map[string]any{
"System": system,
"Messages": collated,
@@ -157,15 +160,19 @@ func (t *Template) Execute(w io.Writer, v Values) error {
var b bytes.Buffer
var prompt, response string
for i, m := range collated {
if m.Role == "user" {
switch m.Role {
case "user":
prompt = m.Content
} else {
if i != 0 {
system = ""
}
case "assistant":
response = m.Content
}
if i != len(collated)-1 && prompt != "" && response != "" {
if err := t.Template.Execute(&b, map[string]any{
"System": "",
"System": system,
"Prompt": prompt,
"Response": response,
}); err != nil {
@@ -178,18 +185,21 @@ func (t *Template) Execute(w io.Writer, v Values) error {
}
var cut bool
tree := t.Template.Copy()
// for the last message, cut everything after "{{ .Response }}"
tree.Root.Nodes = slices.DeleteFunc(tree.Root.Nodes, func(n parse.Node) bool {
if slices.Contains(parseNode(n), "Response") {
cut = true
nodes := deleteNode(t.Template.Root.Copy(), func(n parse.Node) bool {
switch t := n.(type) {
case *parse.ActionNode:
case *parse.FieldNode:
if slices.Contains(t.Ident, "Response") {
cut = true
}
}
return cut
})
if err := template.Must(template.New("").AddParseTree("", tree)).Execute(&b, map[string]any{
"System": system,
tree := parse.Tree{Root: nodes.(*parse.ListNode)}
if err := template.Must(template.New("").AddParseTree("", &tree)).Execute(&b, map[string]any{
"System": "",
"Prompt": prompt,
}); err != nil {
return err
@@ -286,3 +296,72 @@ func parseNode(n parse.Node) []string {
return nil
}
// deleteNode walks the node list and deletes nodes that match the predicate
// this is currently used to remove the {{ .Response }} node from templates
func deleteNode(n parse.Node, fn func(parse.Node) bool) parse.Node {
var walk func(n parse.Node) parse.Node
walk = func(n parse.Node) parse.Node {
if fn(n) {
return nil
}
switch t := n.(type) {
case *parse.ListNode:
var nodes []parse.Node
for _, c := range t.Nodes {
if n := walk(c); n != nil {
nodes = append(nodes, n)
}
}
t.Nodes = nodes
return t
case *parse.IfNode:
t.BranchNode = *(walk(&t.BranchNode).(*parse.BranchNode))
case *parse.WithNode:
t.BranchNode = *(walk(&t.BranchNode).(*parse.BranchNode))
case *parse.RangeNode:
t.BranchNode = *(walk(&t.BranchNode).(*parse.BranchNode))
case *parse.BranchNode:
t.List = walk(t.List).(*parse.ListNode)
if t.ElseList != nil {
t.ElseList = walk(t.ElseList).(*parse.ListNode)
}
case *parse.ActionNode:
n := walk(t.Pipe)
if n == nil {
return nil
}
t.Pipe = n.(*parse.PipeNode)
case *parse.PipeNode:
var commands []*parse.CommandNode
for _, c := range t.Cmds {
var args []parse.Node
for _, a := range c.Args {
if n := walk(a); n != nil {
args = append(args, n)
}
}
if len(args) == 0 {
return nil
}
c.Args = args
commands = append(commands, c)
}
if len(commands) == 0 {
return nil
}
t.Cmds = commands
}
return n
}
return walk(n)
}
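
To see what deleteNode accomplishes, here is a simplified, self-contained demo that cuts a template at its {{ .Response }} action and renders the truncated tree. Unlike the real deleteNode, this walk only inspects top-level nodes and does not recurse into if/with/range branches:

```go
package main

import (
	"fmt"
	"os"
	"slices"
	"text/template"
	"text/template/parse"
)

func main() {
	// Parse a toy prompt template and cut everything from the
	// {{ .Response }} action onward, as Execute does for the final turn.
	tmpl := template.Must(template.New("t").Parse("{{ .Prompt }} -> {{ .Response }}<eos>"))

	var cut bool
	tree := tmpl.Tree.Copy()
	tree.Root.Nodes = slices.DeleteFunc(tree.Root.Nodes, func(n parse.Node) bool {
		// Flip cut once we reach a field referencing Response; every
		// node from that point on is dropped.
		if a, ok := n.(*parse.ActionNode); ok {
			for _, cmd := range a.Pipe.Cmds {
				for _, arg := range cmd.Args {
					if f, ok := arg.(*parse.FieldNode); ok && slices.Contains(f.Ident, "Response") {
						cut = true
					}
				}
			}
		}
		return cut
	})

	out := template.Must(template.New("").AddParseTree("", tree))
	_ = out.Execute(os.Stdout, map[string]any{"Prompt": "Hello"})
	fmt.Println() // prints "Hello -> " with the response suffix removed
}
```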

View File

@@ -105,8 +105,8 @@ func TestTemplate(t *testing.T) {
}
for n, tt := range cases {
var actual bytes.Buffer
t.Run(n, func(t *testing.T) {
var actual bytes.Buffer
if err := tmpl.Execute(&actual, Values{Messages: tt}); err != nil {
t.Fatal(err)
}
@@ -120,6 +120,25 @@ func TestTemplate(t *testing.T) {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
})
t.Run("legacy", func(t *testing.T) {
var legacy bytes.Buffer
if err := tmpl.Execute(&legacy, Values{Messages: tt, forceLegacy: true}); err != nil {
t.Fatal(err)
}
legacyBytes := legacy.Bytes()
if slices.Contains([]string{"chatqa.gotmpl", "openchat.gotmpl", "vicuna.gotmpl"}, match) && legacyBytes[len(legacyBytes)-1] == ' ' {
t.Log("removing trailing space from legacy output")
legacyBytes = legacyBytes[:len(legacyBytes)-1]
} else if slices.Contains([]string{"codellama-70b-instruct.gotmpl", "llama2-chat.gotmpl", "mistral-instruct.gotmpl"}, match) {
t.Skip("legacy outputs cannot be compared to messages outputs")
}
if diff := cmp.Diff(legacyBytes, actual.Bytes()); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
})
}
})
}
@@ -136,6 +155,21 @@ func TestParse(t *testing.T) {
{"{{ with .Tools }}{{ . }}{{ end }} {{ .System }} {{ .Prompt }}", []string{"prompt", "response", "system", "tools"}},
{"{{ range .Messages }}{{ .Role }} {{ .Content }}{{ end }}", []string{"content", "messages", "role"}},
{"{{ range .Messages }}{{ if eq .Role \"system\" }}SYSTEM: {{ .Content }}{{ else if eq .Role \"user\" }}USER: {{ .Content }}{{ else if eq .Role \"assistant\" }}ASSISTANT: {{ .Content }}{{ end }}{{ end }}", []string{"content", "messages", "role"}},
{`{{- if .Messages }}
{{- if .System }}<|im_start|>system
{{ .System }}<|im_end|>
{{ end }}
{{- range .Messages }}<|im_start|>{{ .Role }}
{{ .Content }}<|im_end|>
{{ end }}<|im_start|>assistant
{{ else -}}
{{ if .System }}<|im_start|>system
{{ .System }}<|im_end|>
{{ end }}{{ if .Prompt }}<|im_start|>user
{{ .Prompt }}<|im_end|>
{{ end }}<|im_start|>assistant
{{ .Response }}<|im_end|>
{{- end -}}`, []string{"content", "messages", "prompt", "response", "role", "system"}},
}
for _, tt := range cases {
@@ -145,9 +179,8 @@ func TestParse(t *testing.T) {
t.Fatal(err)
}
vars := tmpl.Vars()
if !slices.Equal(tt.vars, vars) {
t.Errorf("expected %v, got %v", tt.vars, vars)
if diff := cmp.Diff(tmpl.Vars(), tt.vars); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
})
}
@@ -170,7 +203,7 @@ func TestExecuteWithMessages(t *testing.T) {
{"no response", `[INST] {{ if .System }}{{ .System }}{{ "\n\n" }}{{ end }}{{ .Prompt }}[/INST] `},
{"response", `[INST] {{ if .System }}{{ .System }}{{ "\n\n" }}{{ end }}{{ .Prompt }}[/INST] {{ .Response }}`},
{"messages", `{{- range $index, $_ := .Messages }}
{{- if eq .Role "user" }}[INST] {{ if and (eq (len (slice $.Messages $index)) 1) $.System }}{{ $.System }}{{ "\n\n" }}
{{- if eq .Role "user" }}[INST] {{ if and (eq $index 0) $.System }}{{ $.System }}{{ "\n\n" }}
{{- end }}{{ .Content }}[/INST] {{ else if eq .Role "assistant" }}{{ .Content }}
{{- end }}
{{- end }}`},
@@ -191,7 +224,7 @@ func TestExecuteWithMessages(t *testing.T) {
{"response", `[INST] {{ if .System }}{{ .System }}{{ "\n\n" }}{{ end }}{{ .Prompt }}[/INST] {{ .Response }}`},
{"messages", `
{{- range $index, $_ := .Messages }}
{{- if eq .Role "user" }}[INST] {{ if and (eq (len (slice $.Messages $index)) 1) $.System }}{{ $.System }}{{ "\n\n" }}
{{- if eq .Role "user" }}[INST] {{ if and (eq $index 0) $.System }}{{ $.System }}{{ "\n\n" }}
{{- end }}{{ .Content }}[/INST] {{ else if eq .Role "assistant" }}{{ .Content }}
{{- end }}
{{- end }}`},
@@ -204,9 +237,9 @@ func TestExecuteWithMessages(t *testing.T) {
{Role: "user", Content: "What is your name?"},
},
},
`[INST] Hello friend![/INST] Hello human![INST] You are a helpful assistant!
`[INST] You are a helpful assistant!
What is your name?[/INST] `,
Hello friend![/INST] Hello human![INST] What is your name?[/INST] `,
},
{
"chatml",
@@ -221,7 +254,7 @@ What is your name?[/INST] `,
`},
{"messages", `
{{- range $index, $_ := .Messages }}
{{- if and (eq .Role "user") (eq (len (slice $.Messages $index)) 1) $.System }}<|im_start|>system
{{- if and (eq .Role "user") (eq $index 0) $.System }}<|im_start|>system
{{ $.System }}<|im_end|>{{ "\n" }}
{{- end }}<|im_start|>{{ .Role }}
{{ .Content }}<|im_end|>{{ "\n" }}
@@ -236,12 +269,12 @@ What is your name?[/INST] `,
{Role: "user", Content: "What is your name?"},
},
},
`<|im_start|>user
`<|im_start|>system
You are a helpful assistant!<|im_end|>
<|im_start|>user
Hello friend!<|im_end|>
<|im_start|>assistant
Hello human!<|im_end|>
<|im_start|>system
You are a helpful assistant!<|im_end|>
<|im_start|>user
What is your name?<|im_end|>
<|im_start|>assistant
@@ -300,8 +333,8 @@ Answer: `,
t.Fatal(err)
}
if b.String() != tt.expected {
t.Errorf("expected\n%s,\ngot\n%s", tt.expected, b.String())
if diff := cmp.Diff(b.String(), tt.expected); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
})
}

View File

@@ -1,4 +1,6 @@
You are a helpful assistant.### Instruction:
You are a helpful assistant.
### Instruction:
Hello, how are you?
### Response:

View File

@@ -9,3 +9,4 @@ Source: system
I'd like to show off how chat templating works! <step> Source: assistant
Destination: user

View File

@@ -3,3 +3,4 @@ Source: user
Hello, how are you? <step> Source: assistant
Destination: user

View File

@@ -7,3 +7,4 @@ Source: user
I'd like to show off how chat templating works! <step> Source: assistant
Destination: user

View File

@@ -1 +1 @@
GPT Correct System: You are a helpful assistant.<|end_of_turn|>GPT Correct User: Hello, how are you?<|end_of_turn|>GPT Correct Assistant: I'm doing great. How can I help you today?<|end_of_turn|>GPT Correct User: I'd like to show off how chat templating works!<|end_of_turn|>GPT Correct Assistant:
GPT4 Correct System: You are a helpful assistant.<|end_of_turn|>GPT4 Correct User: Hello, how are you?<|end_of_turn|>GPT4 Correct Assistant: I'm doing great. How can I help you today?<|end_of_turn|>GPT4 Correct User: I'd like to show off how chat templating works!<|end_of_turn|>GPT4 Correct Assistant:

View File

@@ -1 +1 @@
GPT Correct User: Hello, how are you?<|end_of_turn|>GPT Correct Assistant:
GPT4 Correct User: Hello, how are you?<|end_of_turn|>GPT4 Correct Assistant:

View File

@@ -1 +1 @@
GPT Correct User: Hello, how are you?<|end_of_turn|>GPT Correct Assistant: I'm doing great. How can I help you today?<|end_of_turn|>GPT Correct User: I'd like to show off how chat templating works!<|end_of_turn|>GPT Correct Assistant:
GPT4 Correct User: Hello, how are you?<|end_of_turn|>GPT4 Correct Assistant: I'm doing great. How can I help you today?<|end_of_turn|>GPT4 Correct User: I'd like to show off how chat templating works!<|end_of_turn|>GPT4 Correct Assistant:

View File

@@ -7,8 +7,9 @@
{{ else if eq .Role "assistant" }}ASSISTANT: {{ .Content }}</s>
{{ end }}
{{- end }}ASSISTANT:
{{- else }}
{{- else -}}
{{ if .System }}{{ .System }}
{{ end }}{{ if .Prompt }}USER: {{ .Prompt }}
{{ end }}ASSISTANT: {{ .Response }}
{{- end }}
{{ end }}ASSISTANT: {{ .Response }}</s>
{{ end -}}

View File

@@ -5,11 +5,11 @@
{{- range .Messages }}<|{{ .Role }}|>
{{ .Content }}</s>
{{ end }}<|assistant|>
{{ else }}
{{ else -}}
{{ if .System }}<|system|>
{{ .System }}</s>
{{ end }}{{ if .Prompt }}<|user|>
{{ .Prompt }}</s>
{{ end }}<|assistant|>
{{ .Response }}</s>
{{- end }}
{{ end -}}