sched: only error when over-allocating system memory (#5626)

llm: dont link cuda with compat libs (#5621)
Merge pull request #5620 from ollama/mxyng/templates
2026-01-05 14:11:04 -05:00 · 2024-07-11 00:53:12 -07:00 · 2024-07-10 20:01:52 -07:00 · 2024-07-10 17:16:24 -07:00 · 2024-07-10 17:03:08 -07:00 · 2024-07-10 17:03:08 -07:00
33 changed files with 250 additions and 126 deletions
--- a/llm/generate/gen_linux.sh
+++ b/llm/generate/gen_linux.sh
@@ -178,7 +178,7 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then
        CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${OLLAMA_CUSTOM_CUDA_DEFS}"
        echo "Building custom CUDA GPU"
    else
-        CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_FLAGS=-t8 -DGGML_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} -DCMAKE_LIBRARY_PATH=/usr/local/cuda/compat"
+        CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_FLAGS=-t8 -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
    fi
    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS}"
    BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}"
--- a/llm/ggml.go
+++ b/llm/ggml.go
@@ -424,6 +424,32 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
 			4*batch*(3*embedding+vocab)+embedding*vocab*105/128,
 			4*batch*(2*embedding+1+2*embeddingHeadsK*headsKV+context+context*headsKV)+4*embeddingHeadsK*context*headsKV+embedding*embeddingHeadsK*headsKV*9/16,
 		)
+	case "chatglm":
+		fullOffload = 4 * batch * (embedding + vocab)
+		partialOffload = 4*batch*(embedding+vocab) + embedding*vocab*105/128
+		if qkvBias, ok := layers["blk.0"]["attn_qkv.bias"]; ok {
+			fullOffload = max(
+				fullOffload,
+				4*batch*(2+
+					2*embedding+
+					context+
+					context*heads+
+					embeddingHeadsK*heads+
+					qkvBias.Shape[0]),
+			)
+
+			partialOffload = max(
+				partialOffload,
+				4*batch*(1+
+					2*embedding+
+					embeddingHeadsK*heads+
+					context+
+					context*heads)+
+					4*embeddingHeadsK*context+
+					4*context*embeddingHeadsK+
+					4*qkvBias.Shape[0],
+			)
+		}
 	}

 	return
--- a/llm/server.go
+++ b/llm/server.go
@@ -122,6 +122,15 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		}
 	}

+	// On linux, over-allocating CPU memory will almost always result in an error
+	if runtime.GOOS == "linux" {
+		systemMemoryRequired := estimate.TotalSize - estimate.VRAMSize
+		if systemMemoryRequired > systemTotalMemory {
+			slog.Warn("model request too large for system", "requested", format.HumanBytes2(systemMemoryRequired), "system", format.HumanBytes2(systemTotalMemory))
+			return nil, fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(systemMemoryRequired), format.HumanBytes2(systemTotalMemory))
+		}
+	}
+
 	estimate.log()

 	// Loop through potential servers
--- a/server/prompt_test.go
+++ b/server/prompt_test.go
@@ -161,7 +161,7 @@ func TestChatPrompt(t *testing.T) {
 				{Role: "user", Content: "A test. And a thumping good one at that, I'd wager."},
 			},
 			expect: expect{
-				prompt: "You're a test, Harry! I-I'm a what? You are the Test Who Lived. A test. And a thumping good one at that, I'd wager. ",
+				prompt: "You are the Test Who Lived. You're a test, Harry! I-I'm a what? A test. And a thumping good one at that, I'd wager. ",
 			},
 		},
 	}
--- a/server/routes_create_test.go
+++ b/server/routes_create_test.go
@@ -546,8 +546,8 @@ func TestCreateDetectTemplate(t *testing.T) {

 		checkFileExists(t, filepath.Join(p, "blobs", "*"), []string{
 			filepath.Join(p, "blobs", "sha256-553c4a3f747b3d22a4946875f1cc8ed011c2930d83f864a0c7265f9ec0a20413"),
-			filepath.Join(p, "blobs", "sha256-9512c372dfc7d84d6065b8dd2b601aeed8cc1a78e7a7aa784a42fff37f5524b7"),
-			filepath.Join(p, "blobs", "sha256-b8b78cb8c6eefd14c06f1af042e6161255bf87bbf2dd14fce57cdac893db8139"),
+			filepath.Join(p, "blobs", "sha256-68b0323b2f21572bc09ba07554b16b379a5713ee48ef8c25a7661a1f71cfce77"),
+			filepath.Join(p, "blobs", "sha256-eb72fb7c550ee1f1dec4039bd65382acecf5f7536a30fb7ccace39a8d0cb590b"),
 		})
 	})

--- a/server/sched.go
+++ b/server/sched.go
@@ -135,11 +135,6 @@ func (s *Scheduler) processPending(ctx context.Context) {
 			}

 			for {
-				cpus := s.getCpuFn()
-				var systemMem gpu.GpuInfo
-				if len(cpus) > 0 {
-					systemMem = cpus[0]
-				}
 				var runnerToExpire *runnerRef
 				s.loadedMu.Lock()
 				runner := s.loaded[pending.model.ModelPath]
@@ -193,38 +188,6 @@ func (s *Scheduler) processPending(ctx context.Context) {
 						break
 					}

-					estimate := llm.EstimateGPULayers(gpus, ggml, pending.model.ProjectorPaths, pending.opts)
-					maxSize := systemMem.FreeMemory
-
-					// Add available GPU memory to the total pool
-					// macOS hardware has unified memory so don't double count
-					if runtime.GOOS != "darwin" {
-						for _, gpu := range gpus {
-							if gpu.Library == "cpu" {
-								continue
-							}
-							if loadedCount == 0 {
-								// If no other models are loaded, set the limit based on what's available
-								maxSize += gpu.FreeMemory
-							} else {
-								// Other models could be unloaded, favor total memory for limit
-								maxSize += gpu.TotalMemory
-							}
-						}
-					}
-
-					// Block attempting to load a model larger than system memory + GPU memory
-					if estimate.TotalSize > maxSize {
-						slog.Warn("model request too large for system", "requested", format.HumanBytes2(estimate.TotalSize), "system", format.HumanBytes2(maxSize))
-
-						// Linux will crash if over-allocating memory - return an error to the user.
-						// TODO (jmorganca): add reasonable upper limits for darwin and windows as well
-						if runtime.GOOS == "linux" {
-							pending.errCh <- fmt.Errorf("requested model (%s) is too large for this system (%s)", format.HumanBytes2(estimate.TotalSize), format.HumanBytes2(maxSize))
-							break
-						}
-					}
-
 					// Evaluate if the model will fit in the available system memory, or if we should unload a model first
 					if len(gpus) == 1 && gpus[0].Library == "cpu" {
 						// simplifying assumption of defaultParallel when in CPU mode
--- a/template/alfred.gotmpl
+++ b/template/alfred.gotmpl
@@ -3,6 +3,6 @@
 {{- end }}
 {{- range .Messages }}<start_{{ .Role }}>{{ .Content }}<end_message>
 {{- end }}<start_assistant>
-{{- else }}
+{{- else -}}
 {{ if .System }}<start_system>{{ .System }}<end_message>{{ end }}{{ if .Prompt }}<start_user>{{ .Prompt }}<end_message>{{ end }}<start_assistant>{{ .Response }}<end_message>
-{{- end }}
+{{- end -}}
--- a/template/alpaca.gotmpl
+++ b/template/alpaca.gotmpl
@@ -1,6 +1,7 @@
 {{- if .Messages }}
 {{- if .System }}{{ .System }}
-{{- end }}
+
+{{ end }}
 {{- range .Messages }}
 {{- if eq .Role "user" }}### Instruction:
 {{- else if eq .Role "assistant" }}### Response:
@@ -8,7 +9,7 @@
 {{ .Content }}

 {{ end }}### Response:
-{{ else }}
+{{ else -}}
 {{ if .System }}{{ .System }}

 {{ end }}{{ if .Prompt }}### Instruction:
@@ -16,4 +17,5 @@

 {{ end }}### Response:
 {{ .Response }}
-{{- end }}
+
+{{ end -}}
--- a/template/chatml.gotmpl
+++ b/template/chatml.gotmpl
@@ -5,11 +5,11 @@
 {{- range .Messages }}<|im_start|>{{ .Role }}
 {{ .Content }}<|im_end|>
 {{ end }}<|im_start|>assistant
-{{ else }}
+{{ else -}}
 {{ if .System }}<|im_start|>system
 {{ .System }}<|im_end|>
 {{ end }}{{ if .Prompt }}<|im_start|>user
 {{ .Prompt }}<|im_end|>
 {{ end }}<|im_start|>assistant
 {{ .Response }}<|im_end|>
-{{- end }}
+{{ end -}}
--- a/template/chatqa.gotmpl
+++ b/template/chatqa.gotmpl
@@ -8,10 +8,11 @@
 {{- end }} {{ .Content }}

 {{ end }}Assistant:
-{{- else }}
+{{- else -}}
 {{ if .System }}System: {{ .System }}

 {{ end }}{{ if .Prompt }}User: {{ .Prompt }}

-{{ end }}Assistant: <|begin_of_text|>{{ .Response }}
-{{- end }}
+{{ end }}Assistant: {{ .Response }}
+
+{{ end -}}
--- a/template/codellama-70b-instruct.gotmpl
+++ b/template/codellama-70b-instruct.gotmpl
@@ -7,13 +7,13 @@
 {{ .Content }} <step> {{ end }}Source: assistant
 Destination: user

-{{ else }}
-{{ if .System }} Source: system
+ {{ else -}}
+{{ if .System }}Source: system

- {{ .System }} <step>{{ end }} Source: user
+ {{ .System }} <step> {{ end }}Source: user

 {{ .Prompt }} <step> Source: assistant
 Destination: user

- {{ .Response }}<step>
-{{- end }}
+ {{ .Response }} <step>
+{{- end -}}
--- a/template/falcon-instruct.gotmpl
+++ b/template/falcon-instruct.gotmpl
@@ -6,8 +6,10 @@
 {{ else if eq .Role "assistant" }}Falcon:
 {{ end }}{{ .Content }}
 {{ end }}Falcon:
-{{ else }}
-{{ if .System }}{{ .System }}
-{{ end }}{{ if .Prompt }}User: {{ .Prompt }}
-{{ end }}Assistant: {{ .Response }}
-{{- end }}
+{{ else -}}
+{{ if .System }}System: {{ .System }}
+{{ end }}{{ if .Prompt }}User:
+{{ .Prompt }}
+{{ end }}Falcon:
+{{ .Response }}
+{{ end -}}
--- a/template/gemma-instruct.gotmpl
+++ b/template/gemma-instruct.gotmpl
@@ -8,9 +8,10 @@
 {{- end }}
 {{ .Content }}<end_of_turn>
 {{ end }}<start_of_turn>model
-{{ else }}
+{{ else -}}
 <start_of_turn>user
-{{ if .System }}{{ .System }} {{ end }}{{ .Prompt }}<end_of_turn>
+{{ if .System }}{{ .System }}
+{{ end }}{{ .Prompt }}<end_of_turn>
 <start_of_turn>model
 {{ .Response }}<end_of_turn>
-{{- end }}
+{{ end -}}
--- a/template/granite-instruct.gotmpl
+++ b/template/granite-instruct.gotmpl
@@ -10,9 +10,8 @@
 {{ .Content }}

 {{ end }}Answer:
-{{ else }}
-{{ if .System }}
-System:
+{{ else -}}
+{{ if .System }}System:
 {{ .System }}

 {{ end }}{{ if .Prompt }}Question:
@@ -20,4 +19,5 @@ System:

 {{ end }}Answer:
 {{ .Response }}
-{{- end }}
+
+{{ end -}}
--- a/template/llama2-chat.gotmpl
+++ b/template/llama2-chat.gotmpl
@@ -9,8 +9,8 @@
 {{- else }} [/INST] {{ .Content }}</s><s>
 {{- end }}
 {{- end }} [/INST]
-{{- else }}
-[INST] <<SYS>>{{ .System }}<</SYS>>
+{{- else -}}
+[INST] <<SYS>>{{ if .System }}{{ .System }}{{ end }}<</SYS>>

-{{ .Prompt }} [/INST] {{ .Response }}
-{{- end }}
+{{ .Prompt }} [/INST] {{ .Response }}</s>
+{{- end -}}
--- a/template/llama3-instruct.gotmpl
+++ b/template/llama3-instruct.gotmpl
@@ -8,7 +8,7 @@
 {{ .Content }}<|eot_id|>
 {{- end }}<|start_header_id|>assistant<|end_header_id|>

-{{ else }}
+{{ else -}}
 {{ if .System }}<|start_header_id|>system<|end_header_id|>

 {{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>
@@ -16,4 +16,4 @@
 {{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>

 {{ .Response }}<|eot_id|>
-{{- end }}
+{{- end -}}
--- a/template/magicoder.gotmpl
+++ b/template/magicoder.gotmpl
@@ -9,7 +9,7 @@
 {{ .Content }}

 {{ end }}@@ Response
-{{ else }}
+{{ else -}}
 {{ if .System }}{{ .System }}

 {{ end }}{{ if .Prompt }}@@ Instruction
@@ -17,4 +17,5 @@

 {{ end }}@@ Response
 {{ .Response }}
-{{- end }}
+
+{{ end -}}
--- a/template/mistral-instruct.gotmpl
+++ b/template/mistral-instruct.gotmpl
@@ -5,5 +5,6 @@
 {{- else if eq .Role "assistant" }}[/INST] {{ .Content }}</s>
 {{- end }}
 {{- end }}[/INST]
-{{- else }}[INST] {{ if .System }}{{ .System }} {{ end }}{{ .Prompt }} [/INST] {{ .Response }}
-{{- end }}
+{{- else -}}
+[INST] {{ if .System }}{{ .System }} {{ end }}{{ .Prompt }}[/INST] {{ .Response }}</s>
+{{- end -}}
--- a/template/openchat.gotmpl
+++ b/template/openchat.gotmpl
@@ -1,11 +1,11 @@
 {{- if .Messages }}
-{{- if .System }}GPT Correct System: {{ .System }}<|end_of_turn|>
+{{- if .System }}GPT4 Correct System: {{ .System }}<|end_of_turn|>
 {{- end }}
-{{- range .Messages }}GPT Correct
+{{- range .Messages }}GPT4 Correct
 {{- if eq .Role "user" }} User:
 {{- else if eq .Role "assistant" }} Assistant:
 {{- end }} {{ .Content }}<|end_of_turn|>
-{{- end }}GPT Correct Assistant:
-{{- else }}
-{{ .System }}<|end_of_turn|>GPT4 Correct User: {{ .Prompt }}<|end_of_turn|>GPT4 Correct Assistant: {{ .Response }}<|end_of_turn|>
-{{- end }}
+{{- end }}GPT4 Correct Assistant:
+{{- else -}}
+{{ if .System }}GPT4 Correct System: {{ .System }}<|end_of_turn|>{{ end }}GPT4 Correct User: {{ .Prompt }}<|end_of_turn|>GPT4 Correct Assistant: {{ .Response }}<|end_of_turn|>
+{{- end -}}
--- a/template/phi-3.gotmpl
+++ b/template/phi-3.gotmpl
@@ -5,11 +5,11 @@
 {{- range .Messages }}<|{{ .Role }}|>
 {{ .Content }}<|end|>
 {{ end }}<|assistant|>
-{{ else }}
+{{ else -}}
 {{ if .System }}<|system|>
 {{ .System }}<|end|>
 {{ end }}{{ if .Prompt }}<|user|>
 {{ .Prompt }}<|end|>
 {{ end }}<|assistant|>
 {{ .Response }}<|end|>
-{{- end }}
+{{ end -}}
--- a/template/solar-instruct.gotmpl
+++ b/template/solar-instruct.gotmpl
@@ -10,7 +10,7 @@
 {{ .Content }}</s>
 {{ end }}
 {{ end }}### Assistant:
-{{ else }}
+{{ else -}}
 {{ if .System }}### System:
 {{ .System }}

@@ -18,5 +18,6 @@
 {{ .Prompt }}

 {{ end }}### Assistant:
-{{ .Response }}
-{{- end }}
+{{ .Response }}</s>
+
+{{ end -}}
--- a/template/starcoder2-instruct.gotmpl
+++ b/template/starcoder2-instruct.gotmpl
@@ -11,14 +11,13 @@

 {{ end }}
 {{- end }}### Response
-{{ else }}
+{{ else -}}
 {{ if .System }}{{ .System }}

 {{ end }}{{ if .Prompt }}### Instruction
 {{ .Prompt }}

-
 {{ end }}### Response
 {{ .Response }}<|endoftext|>

-{{- end }}
+{{ end -}}
--- a/template/template.go
+++ b/template/template.go
@@ -143,11 +143,14 @@ func (t *Template) Vars() []string {

 type Values struct {
 	Messages []api.Message
+
+	// forceLegacy is a flag used to test compatibility with legacy templates
+	forceLegacy bool
 }

 func (t *Template) Execute(w io.Writer, v Values) error {
 	system, collated := collate(v.Messages)
-	if slices.Contains(t.Vars(), "messages") {
+	if !v.forceLegacy && slices.Contains(t.Vars(), "messages") {
 		return t.Template.Execute(w, map[string]any{
 			"System":   system,
 			"Messages": collated,
@@ -157,15 +160,19 @@ func (t *Template) Execute(w io.Writer, v Values) error {
 	var b bytes.Buffer
 	var prompt, response string
 	for i, m := range collated {
-		if m.Role == "user" {
+		switch m.Role {
+		case "user":
 			prompt = m.Content
-		} else {
+			if i != 0 {
+				system = ""
+			}
+		case "assistant":
 			response = m.Content
 		}

 		if i != len(collated)-1 && prompt != "" && response != "" {
 			if err := t.Template.Execute(&b, map[string]any{
-				"System":   "",
+				"System":   system,
 				"Prompt":   prompt,
 				"Response": response,
 			}); err != nil {
@@ -178,18 +185,21 @@ func (t *Template) Execute(w io.Writer, v Values) error {
 	}

 	var cut bool
-	tree := t.Template.Copy()
-	// for the last message, cut everything after "{{ .Response }}"
-	tree.Root.Nodes = slices.DeleteFunc(tree.Root.Nodes, func(n parse.Node) bool {
-		if slices.Contains(parseNode(n), "Response") {
-			cut = true
+	nodes := deleteNode(t.Template.Root.Copy(), func(n parse.Node) bool {
+		switch t := n.(type) {
+		case *parse.ActionNode:
+		case *parse.FieldNode:
+			if slices.Contains(t.Ident, "Response") {
+				cut = true
+			}
 		}

 		return cut
 	})

-	if err := template.Must(template.New("").AddParseTree("", tree)).Execute(&b, map[string]any{
-		"System": system,
+	tree := parse.Tree{Root: nodes.(*parse.ListNode)}
+	if err := template.Must(template.New("").AddParseTree("", &tree)).Execute(&b, map[string]any{
+		"System": "",
 		"Prompt": prompt,
 	}); err != nil {
 		return err
@@ -286,3 +296,72 @@ func parseNode(n parse.Node) []string {

 	return nil
 }
+
+// deleteNode walks the node tree rooted at n and removes every node for which fn returns true.
+// It is currently used to remove the {{ .Response }} node from templates.
+func deleteNode(n parse.Node, fn func(parse.Node) bool) parse.Node {
+	var walk func(n parse.Node) parse.Node
+	walk = func(n parse.Node) parse.Node {
+		if fn(n) {
+			return nil
+		}
+
+		switch t := n.(type) {
+		case *parse.ListNode:
+			var nodes []parse.Node
+			for _, c := range t.Nodes {
+				if n := walk(c); n != nil {
+					nodes = append(nodes, n)
+				}
+			}
+
+			t.Nodes = nodes
+			return t
+		case *parse.IfNode:
+			t.BranchNode = *(walk(&t.BranchNode).(*parse.BranchNode))
+		case *parse.WithNode:
+			t.BranchNode = *(walk(&t.BranchNode).(*parse.BranchNode))
+		case *parse.RangeNode:
+			t.BranchNode = *(walk(&t.BranchNode).(*parse.BranchNode))
+		case *parse.BranchNode:
+			t.List = walk(t.List).(*parse.ListNode)
+			if t.ElseList != nil {
+				t.ElseList = walk(t.ElseList).(*parse.ListNode)
+			}
+		case *parse.ActionNode:
+			n := walk(t.Pipe)
+			if n == nil {
+				return nil
+			}
+
+			t.Pipe = n.(*parse.PipeNode)
+		case *parse.PipeNode:
+			var commands []*parse.CommandNode
+			for _, c := range t.Cmds {
+				var args []parse.Node
+				for _, a := range c.Args {
+					if n := walk(a); n != nil {
+						args = append(args, n)
+					}
+				}
+
+				if len(args) == 0 {
+					return nil
+				}
+
+				c.Args = args
+				commands = append(commands, c)
+			}
+
+			if len(commands) == 0 {
+				return nil
+			}
+
+			t.Cmds = commands
+		}
+
+		return n
+	}
+
+	return walk(n)
+}
--- a/template/template_test.go
+++ b/template/template_test.go
@@ -105,8 +105,8 @@ func TestTemplate(t *testing.T) {
 			}

 			for n, tt := range cases {
+				var actual bytes.Buffer
 				t.Run(n, func(t *testing.T) {
-					var actual bytes.Buffer
 					if err := tmpl.Execute(&actual, Values{Messages: tt}); err != nil {
 						t.Fatal(err)
 					}
@@ -120,6 +120,25 @@ func TestTemplate(t *testing.T) {
 						t.Errorf("mismatch (-got +want):\n%s", diff)
 					}
 				})
+
+				t.Run("legacy", func(t *testing.T) {
+					var legacy bytes.Buffer
+					if err := tmpl.Execute(&legacy, Values{Messages: tt, forceLegacy: true}); err != nil {
+						t.Fatal(err)
+					}
+
+					legacyBytes := legacy.Bytes()
+					if slices.Contains([]string{"chatqa.gotmpl", "openchat.gotmpl", "vicuna.gotmpl"}, match) && legacyBytes[len(legacyBytes)-1] == ' ' {
+						t.Log("removing trailing space from legacy output")
+						legacyBytes = legacyBytes[:len(legacyBytes)-1]
+					} else if slices.Contains([]string{"codellama-70b-instruct.gotmpl", "llama2-chat.gotmpl", "mistral-instruct.gotmpl"}, match) {
+						t.Skip("legacy outputs cannot be compared to messages outputs")
+					}
+
+					if diff := cmp.Diff(legacyBytes, actual.Bytes()); diff != "" {
+						t.Errorf("mismatch (-got +want):\n%s", diff)
+					}
+				})
 			}
 		})
 	}
@@ -136,6 +155,21 @@ func TestParse(t *testing.T) {
 		{"{{ with .Tools }}{{ . }}{{ end }} {{ .System }} {{ .Prompt }}", []string{"prompt", "response", "system", "tools"}},
 		{"{{ range .Messages }}{{ .Role }} {{ .Content }}{{ end }}", []string{"content", "messages", "role"}},
 		{"{{ range .Messages }}{{ if eq .Role \"system\" }}SYSTEM: {{ .Content }}{{ else if eq .Role \"user\" }}USER: {{ .Content }}{{ else if eq .Role \"assistant\" }}ASSISTANT: {{ .Content }}{{ end }}{{ end }}", []string{"content", "messages", "role"}},
+		{`{{- if .Messages }}
+{{- if .System }}<|im_start|>system
+{{ .System }}<|im_end|>
+{{ end }}
+{{- range .Messages }}<|im_start|>{{ .Role }}
+{{ .Content }}<|im_end|>
+{{ end }}<|im_start|>assistant
+{{ else -}}
+{{ if .System }}<|im_start|>system
+{{ .System }}<|im_end|>
+{{ end }}{{ if .Prompt }}<|im_start|>user
+{{ .Prompt }}<|im_end|>
+{{ end }}<|im_start|>assistant
+{{ .Response }}<|im_end|>
+{{- end -}}`, []string{"content", "messages", "prompt", "response", "role", "system"}},
 	}

 	for _, tt := range cases {
@@ -145,9 +179,8 @@ func TestParse(t *testing.T) {
 				t.Fatal(err)
 			}

-			vars := tmpl.Vars()
-			if !slices.Equal(tt.vars, vars) {
-				t.Errorf("expected %v, got %v", tt.vars, vars)
+			if diff := cmp.Diff(tmpl.Vars(), tt.vars); diff != "" {
+				t.Errorf("mismatch (-got +want):\n%s", diff)
 			}
 		})
 	}
@@ -170,7 +203,7 @@ func TestExecuteWithMessages(t *testing.T) {
 				{"no response", `[INST] {{ if .System }}{{ .System }}{{ "\n\n" }}{{ end }}{{ .Prompt }}[/INST] `},
 				{"response", `[INST] {{ if .System }}{{ .System }}{{ "\n\n" }}{{ end }}{{ .Prompt }}[/INST] {{ .Response }}`},
 				{"messages", `{{- range $index, $_ := .Messages }}
-{{- if eq .Role "user" }}[INST] {{ if and (eq (len (slice $.Messages $index)) 1) $.System }}{{ $.System }}{{ "\n\n" }}
+{{- if eq .Role "user" }}[INST] {{ if and (eq $index 0) $.System }}{{ $.System }}{{ "\n\n" }}
 {{- end }}{{ .Content }}[/INST] {{ else if eq .Role "assistant" }}{{ .Content }}
 {{- end }}
 {{- end }}`},
@@ -191,7 +224,7 @@ func TestExecuteWithMessages(t *testing.T) {
 				{"response", `[INST] {{ if .System }}{{ .System }}{{ "\n\n" }}{{ end }}{{ .Prompt }}[/INST] {{ .Response }}`},
 				{"messages", `
 {{- range $index, $_ := .Messages }}
-{{- if eq .Role "user" }}[INST] {{ if and (eq (len (slice $.Messages $index)) 1) $.System }}{{ $.System }}{{ "\n\n" }}
+{{- if eq .Role "user" }}[INST] {{ if and (eq $index 0) $.System }}{{ $.System }}{{ "\n\n" }}
 {{- end }}{{ .Content }}[/INST] {{ else if eq .Role "assistant" }}{{ .Content }}
 {{- end }}
 {{- end }}`},
@@ -204,9 +237,9 @@ func TestExecuteWithMessages(t *testing.T) {
 					{Role: "user", Content: "What is your name?"},
 				},
 			},
-			`[INST] Hello friend![/INST] Hello human![INST] You are a helpful assistant!
+			`[INST] You are a helpful assistant!

-What is your name?[/INST] `,
+Hello friend![/INST] Hello human![INST] What is your name?[/INST] `,
 		},
 		{
 			"chatml",
@@ -221,7 +254,7 @@ What is your name?[/INST] `,
 `},
 				{"messages", `
 {{- range $index, $_ := .Messages }}
-{{- if and (eq .Role "user") (eq (len (slice $.Messages $index)) 1) $.System }}<|im_start|>system
+{{- if and (eq .Role "user") (eq $index 0) $.System }}<|im_start|>system
 {{ $.System }}<|im_end|>{{ "\n" }}
 {{- end }}<|im_start|>{{ .Role }}
 {{ .Content }}<|im_end|>{{ "\n" }}
@@ -236,12 +269,12 @@ What is your name?[/INST] `,
 					{Role: "user", Content: "What is your name?"},
 				},
 			},
-			`<|im_start|>user
+			`<|im_start|>system
+You are a helpful assistant!<|im_end|>
+<|im_start|>user
 Hello friend!<|im_end|>
 <|im_start|>assistant
 Hello human!<|im_end|>
-<|im_start|>system
-You are a helpful assistant!<|im_end|>
 <|im_start|>user
 What is your name?<|im_end|>
 <|im_start|>assistant
@@ -300,8 +333,8 @@ Answer: `,
 						t.Fatal(err)
 					}

-					if b.String() != tt.expected {
-						t.Errorf("expected\n%s,\ngot\n%s", tt.expected, b.String())
+					if diff := cmp.Diff(b.String(), tt.expected); diff != "" {
+						t.Errorf("mismatch (-got +want):\n%s", diff)
 					}
 				})
 			}
--- a/template/testdata/alpaca.gotmpl/system-user-assistant-user
+++ b/template/testdata/alpaca.gotmpl/system-user-assistant-user
@@ -1,4 +1,6 @@
-You are a helpful assistant.### Instruction:
+You are a helpful assistant.
+
+### Instruction:
 Hello, how are you?

 ### Response:
--- a/template/testdata/codellama-70b-instruct.gotmpl/system-user-assistant-user
+++ b/template/testdata/codellama-70b-instruct.gotmpl/system-user-assistant-user
@@ -9,3 +9,4 @@ Source: system
 I'd like to show off how chat templating works! <step> Source: assistant
 Destination: user

+ 
--- a/template/testdata/codellama-70b-instruct.gotmpl/user
+++ b/template/testdata/codellama-70b-instruct.gotmpl/user
@@ -3,3 +3,4 @@ Source: user
 Hello, how are you? <step> Source: assistant
 Destination: user

+ 
--- a/template/testdata/codellama-70b-instruct.gotmpl/user-assistant-user
+++ b/template/testdata/codellama-70b-instruct.gotmpl/user-assistant-user
@@ -7,3 +7,4 @@ Source: user
 I'd like to show off how chat templating works! <step> Source: assistant
 Destination: user

+ 
--- a/template/testdata/openchat.gotmpl/system-user-assistant-user
+++ b/template/testdata/openchat.gotmpl/system-user-assistant-user
@@ -1 +1 @@
-GPT Correct System: You are a helpful assistant.<|end_of_turn|>GPT Correct User: Hello, how are you?<|end_of_turn|>GPT Correct Assistant: I'm doing great. How can I help you today?<|end_of_turn|>GPT Correct User: I'd like to show off how chat templating works!<|end_of_turn|>GPT Correct Assistant:
+GPT4 Correct System: You are a helpful assistant.<|end_of_turn|>GPT4 Correct User: Hello, how are you?<|end_of_turn|>GPT4 Correct Assistant: I'm doing great. How can I help you today?<|end_of_turn|>GPT4 Correct User: I'd like to show off how chat templating works!<|end_of_turn|>GPT4 Correct Assistant:
--- a/template/testdata/openchat.gotmpl/user
+++ b/template/testdata/openchat.gotmpl/user
@@ -1 +1 @@
-GPT Correct User: Hello, how are you?<|end_of_turn|>GPT Correct Assistant:
+GPT4 Correct User: Hello, how are you?<|end_of_turn|>GPT4 Correct Assistant:
--- a/template/testdata/openchat.gotmpl/user-assistant-user
+++ b/template/testdata/openchat.gotmpl/user-assistant-user
@@ -1 +1 @@
-GPT Correct User: Hello, how are you?<|end_of_turn|>GPT Correct Assistant: I'm doing great. How can I help you today?<|end_of_turn|>GPT Correct User: I'd like to show off how chat templating works!<|end_of_turn|>GPT Correct Assistant:
+GPT4 Correct User: Hello, how are you?<|end_of_turn|>GPT4 Correct Assistant: I'm doing great. How can I help you today?<|end_of_turn|>GPT4 Correct User: I'd like to show off how chat templating works!<|end_of_turn|>GPT4 Correct Assistant:
--- a/template/vicuna.gotmpl
+++ b/template/vicuna.gotmpl
@@ -7,8 +7,9 @@
 {{ else if eq .Role "assistant" }}ASSISTANT: {{ .Content }}</s>
 {{ end }}
 {{- end }}ASSISTANT:
-{{- else }}
+{{- else -}}
 {{ if .System }}{{ .System }}
+
 {{ end }}{{ if .Prompt }}USER: {{ .Prompt }}
-{{ end }}ASSISTANT: {{ .Response }}
-{{- end }}
+{{ end }}ASSISTANT: {{ .Response }}</s>
+{{ end -}}
--- a/template/zephyr.gotmpl
+++ b/template/zephyr.gotmpl
@@ -5,11 +5,11 @@
 {{- range .Messages }}<|{{ .Role }}|>
 {{ .Content }}</s>
 {{ end }}<|assistant|>
-{{ else }}
+{{ else -}}
 {{ if .System }}<|system|>
 {{ .System }}</s>
 {{ end }}{{ if .Prompt }}<|user|>
 {{ .Prompt }}</s>
 {{ end }}<|assistant|>
 {{ .Response }}</s>
-{{- end }}
+{{ end -}}
Author	SHA1	Message	Date
Jeffrey Morgan	791650ddef	sched: only error when over-allocating system memory (#5626)	2024-07-11 00:53:12 -07:00
Jeffrey Morgan	efbf41ed81	llm: dont link cuda with compat libs (#5621)	2024-07-10 20:01:52 -07:00
Michael Yang	cf15589851	Merge pull request #5620 from ollama/mxyng/templates: update embedded templates	2024-07-10 17:16:24 -07:00
Michael Yang	19753c18c0	update embedded templates	2024-07-10 17:03:08 -07:00
Michael Yang	41be28096a	add system prompt to first legacy template	2024-07-10 17:03:08 -07:00
Michael Yang	37a570f962	Merge pull request #5612 from ollama/mxyng/mem: chatglm graph	2024-07-10 14:18:33 -07:00
Michael Yang	5a739ff4cb	chatglm graph	2024-07-10 13:43:47 -07:00
Jeffrey Morgan	4e262eb2a8	remove `GGML_CUDA_FORCE_MMQ=on` from build (#5588)	2024-07-10 13:17:13 -07:00