pass the template to the /api/chat endpoint

2026-01-03 13:10:17 -05:00 · 2024-07-10 14:17:39 -07:00
48 changed files with 397 additions and 351 deletions
--- a/api/types.go
+++ b/api/types.go
@@ -84,6 +84,9 @@ type ChatRequest struct {
 	// Model is the model name, as in [GenerateRequest].
 	Model string `json:"model"`

+	// Template overrides the model's default prompt template.
+	Template string `json:"template"`
+
 	// Messages is the messages of the chat - can be used to keep a chat memory.
 	Messages []Message `json:"messages"`

--- a/app/ollama.iss
+++ b/app/ollama.iss
@@ -127,10 +127,6 @@ Type: filesandordirs; Name: "{%USERPROFILE}\.ollama\models"
 Type: filesandordirs; Name: "{%USERPROFILE}\.ollama\history"
 ; NOTE: if the user has a custom OLLAMA_MODELS it will be preserved

-[InstallDelete]
-Type: filesandordirs; Name: "{%TEMP}\ollama*"
-Type: filesandordirs; Name: "{%LOCALAPPDATA}\Programs\Ollama"
-
 [Messages]
 WizardReady=Ollama Windows Preview
 ReadyLabel1=%nLet's get you up and running with your own large language models.
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -947,6 +947,7 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {

 	req := &api.ChatRequest{
 		Model:    opts.Model,
+		Template: opts.Template,
 		Messages: opts.Messages,
 		Format:   opts.Format,
 		Options:  opts.Options,
--- a/cmd/interactive.go
+++ b/cmd/interactive.go
@@ -18,6 +18,7 @@ import (
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/progress"
 	"github.com/ollama/ollama/readline"
+	"github.com/ollama/ollama/template"
 	"github.com/ollama/ollama/types/errtypes"
 )

@@ -205,9 +206,17 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 				fmt.Println("Set system message.")
 				sb.Reset()
 			case MultilineTemplate:
-				opts.Template = sb.String()
-				fmt.Println("Set prompt template.")
+				mTemplate := sb.String()
 				sb.Reset()
+				_, err := template.Parse(mTemplate)
+				if err != nil {
+					multiline = MultilineNone
+					scanner.Prompt.UseAlt = false
+					fmt.Println("The template is invalid.")
+					continue
+				}
+				opts.Template = mTemplate
+				fmt.Println("Set prompt template.")
 			}

 			multiline = MultilineNone
@@ -369,9 +378,15 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 						fmt.Println("Set system message.")
 						sb.Reset()
 					} else if args[1] == "template" {
-						opts.Template = sb.String()
-						fmt.Println("Set prompt template.")
+						mTemplate := sb.String()
 						sb.Reset()
+						_, err := template.Parse(mTemplate)
+						if err != nil {
+							fmt.Println("The template is invalid.")
+							continue
+						}
+						opts.Template = mTemplate
+						fmt.Println("Set prompt template.")
 					}

 					sb.Reset()
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@@ -360,17 +360,14 @@ func GetGPUInfo() GpuInfoList {
 					"before",
 					"total", format.HumanBytes2(cpus[0].TotalMemory),
 					"free", format.HumanBytes2(cpus[0].FreeMemory),
-					"free_swap", format.HumanBytes2(cpus[0].FreeSwap),
 				),
 				slog.Group(
 					"now",
 					"total", format.HumanBytes2(mem.TotalMemory),
 					"free", format.HumanBytes2(mem.FreeMemory),
-					"free_swap", format.HumanBytes2(mem.FreeSwap),
 				),
 			)
 			cpus[0].FreeMemory = mem.FreeMemory
-			cpus[0].FreeSwap = mem.FreeSwap
 		}

 		var memInfo C.mem_info_t
--- a/gpu/gpu_darwin.go
+++ b/gpu/gpu_darwin.go
@@ -57,7 +57,6 @@ func GetCPUMem() (memInfo, error) {
 	return memInfo{
 		TotalMemory: uint64(C.getPhysicalMemory()),
 		FreeMemory:  uint64(C.getFreeMemory()),
-		// FreeSwap omitted as Darwin uses dynamic paging
 	}, nil
 }

--- a/gpu/gpu_linux.go
+++ b/gpu/gpu_linux.go
@@ -50,7 +50,7 @@ var OneapiMgmtName = "libze_intel_gpu.so"

 func GetCPUMem() (memInfo, error) {
 	var mem memInfo
-	var total, available, free, buffers, cached, freeSwap uint64
+	var total, available, free, buffers, cached uint64
 	f, err := os.Open("/proc/meminfo")
 	if err != nil {
 		return mem, err
@@ -70,21 +70,20 @@ func GetCPUMem() (memInfo, error) {
 			_, err = fmt.Sscanf(line, "Buffers:%d", &buffers)
 		case strings.HasPrefix(line, "Cached:"):
 			_, err = fmt.Sscanf(line, "Cached:%d", &cached)
-		case strings.HasPrefix(line, "SwapFree:"):
-			_, err = fmt.Sscanf(line, "SwapFree:%d", &freeSwap)
 		default:
 			continue
 		}
 		if err != nil {
 			return mem, err
 		}
+
+		if total > 0 && available > 0 {
+			mem.TotalMemory = total * format.KibiByte
+			mem.FreeMemory = available * format.KibiByte
+			return mem, nil
+		}
 	}
 	mem.TotalMemory = total * format.KibiByte
-	mem.FreeSwap = freeSwap * format.KibiByte
-	if available > 0 {
-		mem.FreeMemory = available * format.KibiByte
-	} else {
-		mem.FreeMemory = (free + buffers + cached) * format.KibiByte
-	}
+	mem.FreeMemory = (free + buffers + cached) * format.KibiByte
 	return mem, nil
 }
--- a/gpu/gpu_windows.go
+++ b/gpu/gpu_windows.go
@@ -51,5 +51,5 @@ func GetCPUMem() (memInfo, error) {
 	if r1 == 0 {
 		return memInfo{}, fmt.Errorf("GlobalMemoryStatusEx failed: %w", err)
 	}
-	return memInfo{TotalMemory: memStatus.TotalPhys, FreeMemory: memStatus.AvailPhys, FreeSwap: memStatus.AvailPageFile}, nil
+	return memInfo{TotalMemory: memStatus.TotalPhys, FreeMemory: memStatus.AvailPhys}, nil
 }
--- a/gpu/types.go
+++ b/gpu/types.go
@@ -10,7 +10,6 @@ import (
 type memInfo struct {
 	TotalMemory uint64 `json:"total_memory,omitempty"`
 	FreeMemory  uint64 `json:"free_memory,omitempty"`
-	FreeSwap    uint64 `json:"free_swap,omitempty"`
 }

 // Beginning of an `ollama info` command
--- a/llm/generate/gen_linux.sh
+++ b/llm/generate/gen_linux.sh
@@ -178,7 +178,7 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then
        CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${OLLAMA_CUSTOM_CUDA_DEFS}"
        echo "Building custom CUDA GPU"
    else
-        CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_FLAGS=-t8 -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
+        CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_FLAGS=-t8 -DGGML_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} -DCMAKE_LIBRARY_PATH=/usr/local/cuda/compat"
    fi
    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS}"
    BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}"
--- a/llm/ggml.go
+++ b/llm/ggml.go
@@ -424,32 +424,6 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
 			4*batch*(3*embedding+vocab)+embedding*vocab*105/128,
 			4*batch*(2*embedding+1+2*embeddingHeadsK*headsKV+context+context*headsKV)+4*embeddingHeadsK*context*headsKV+embedding*embeddingHeadsK*headsKV*9/16,
 		)
-	case "chatglm":
-		fullOffload = 4 * batch * (embedding + vocab)
-		partialOffload = 4*batch*(embedding+vocab) + embedding*vocab*105/128
-		if qkvBias, ok := layers["blk.0"]["attn_qkv.bias"]; ok {
-			fullOffload = max(
-				fullOffload,
-				4*batch*(2+
-					2*embedding+
-					context+
-					context*heads+
-					embeddingHeadsK*heads+
-					qkvBias.Shape[0]),
-			)
-
-			partialOffload = max(
-				partialOffload,
-				4*batch*(1+
-					2*embedding+
-					embeddingHeadsK*heads+
-					context+
-					context*heads)+
-					4*embeddingHeadsK*context+
-					4*context*embeddingHeadsK+
-					4*qkvBias.Shape[0],
-			)
-		}
 	}

 	return
--- a/llm/llm.go
+++ b/llm/llm.go
@@ -33,7 +33,7 @@ func Quantize(infile, outfile string, ftype fileType) error {
 	params.ftype = ftype.Value()

 	if rc := C.llama_model_quantize(cinfile, coutfile, &params); rc != 0 {
-		return fmt.Errorf("failed to quantize model. This model architecture may not be supported, or you may need to upgrade Ollama to the latest version")
+		return fmt.Errorf("llama_model_quantize: %d", rc)
 	}

 	return nil
--- a/llm/server.go
+++ b/llm/server.go
@@ -88,7 +88,6 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 	var estimate MemoryEstimate
 	var systemTotalMemory uint64
 	var systemFreeMemory uint64
-	var systemSwapFreeMemory uint64

 	systemMemInfo, err := gpu.GetCPUMem()
 	if err != nil {
@@ -96,8 +95,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 	} else {
 		systemTotalMemory = systemMemInfo.TotalMemory
 		systemFreeMemory = systemMemInfo.FreeMemory
-		systemSwapFreeMemory = systemMemInfo.FreeSwap
-		slog.Debug("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "free_swap", format.HumanBytes2(systemSwapFreeMemory))
+		slog.Debug("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory))
 	}

 	// If the user wants zero GPU layers, reset the gpu list to be CPU/system ram info
@@ -124,16 +122,6 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		}
 	}

-	// On linux, over-allocating CPU memory will almost always result in an error
-	if runtime.GOOS == "linux" {
-		systemMemoryRequired := estimate.TotalSize - estimate.VRAMSize
-		available := min(systemTotalMemory, systemFreeMemory+systemSwapFreeMemory)
-		if systemMemoryRequired > available {
-			slog.Warn("model request too large for system", "requested", format.HumanBytes2(systemMemoryRequired), "available", available, "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "swap", format.HumanBytes2(systemSwapFreeMemory))
-			return nil, fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(systemMemoryRequired), format.HumanBytes2(available))
-		}
-	}
-
 	estimate.log()

 	// Loop through potential servers
--- a/server/prompt.go
+++ b/server/prompt.go
@@ -4,6 +4,7 @@ import (
 	"bytes"
 	"context"
 	"log/slog"
+	"slices"

 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/llm"
@@ -16,18 +17,26 @@ type tokenizeFunc func(context.Context, string) ([]int, error)
 // chatPrompt truncates any messages that exceed the context window of the model, making sure to always include 1) the
 // latest message and 2) system messages
 func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.Options, msgs []api.Message) (prompt string, images []llm.ImageData, _ error) {
+	// pull out any system messages which should always be included in the prompt
 	var system []api.Message
+	msgs = slices.DeleteFunc(msgs, func(m api.Message) bool {
+		if m.Role == "system" {
+			system = append(system, m)
+			return true
+		}
+
+		return false
+	})
+
+	if len(system) == 0 && m.System != "" {
+		// add model system prompt since it wasn't provided
+		system = append(system, api.Message{Role: "system", Content: m.System})
+	}
+
 	// always include the last message
 	n := len(msgs) - 1
 	// in reverse, find all messages that fit into context window
 	for i := n - 1; i >= 0; i-- {
-		system = make([]api.Message, 0)
-		for j := range i {
-			if msgs[j].Role == "system" {
-				system = append(system, msgs[j])
-			}
-		}
-
 		var b bytes.Buffer
 		if err := m.Template.Execute(&b, template.Values{Messages: append(system, msgs[i:]...)}); err != nil {
 			return "", nil, err
--- a/server/prompt_test.go
+++ b/server/prompt_test.go
@@ -6,7 +6,6 @@ import (
 	"strings"
 	"testing"

-	"github.com/google/go-cmp/cmp"
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/template"
 )
@@ -161,19 +160,6 @@ func TestChatPrompt(t *testing.T) {
 				{Role: "assistant", Content: "I-I'm a what?"},
 				{Role: "user", Content: "A test. And a thumping good one at that, I'd wager."},
 			},
-			expect: expect{
-				prompt: "You are the Test Who Lived. You're a test, Harry! I-I'm a what? A test. And a thumping good one at that, I'd wager. ",
-			},
-		},
-		{
-			name:  "out of order system",
-			limit: 2048,
-			msgs: []api.Message{
-				{Role: "user", Content: "You're a test, Harry!"},
-				{Role: "assistant", Content: "I-I'm a what?"},
-				{Role: "system", Content: "You are the Test Who Lived."},
-				{Role: "user", Content: "A test. And a thumping good one at that, I'd wager."},
-			},
 			expect: expect{
 				prompt: "You're a test, Harry! I-I'm a what? You are the Test Who Lived. A test. And a thumping good one at that, I'd wager. ",
 			},
@@ -201,10 +187,6 @@ func TestChatPrompt(t *testing.T) {
 				t.Errorf("expected %q, got %q", tt.prompt, prompt)
 			}

-			if diff := cmp.Diff(prompt, tt.prompt); diff != "" {
-				t.Errorf("mismatch (-got +want):\n%s", diff)
-			}
-
 			if len(images) != len(tt.images) {
 				t.Fatalf("expected %d images, got %d", len(tt.images), len(images))
 			}
--- a/server/routes.go
+++ b/server/routes.go
@@ -71,7 +71,7 @@ func modelOptions(model *Model, requestOpts map[string]interface{}) (api.Options

 // scheduleRunner schedules a runner after validating inputs such as capabilities and model options.
 // It returns the allocated runner, model instance, and consolidated options if successful and error otherwise.
-func (s *Server) scheduleRunner(ctx context.Context, name string, caps []Capability, requestOpts map[string]any, keepAlive *api.Duration) (llm.LlamaServer, *Model, *api.Options, error) {
+func (s *Server) scheduleRunner(ctx context.Context, name string, mTemplate string, caps []Capability, requestOpts map[string]any, keepAlive *api.Duration) (llm.LlamaServer, *Model, *api.Options, error) {
 	if name == "" {
 		return nil, nil, nil, fmt.Errorf("model %w", errRequired)
 	}
@@ -81,6 +81,13 @@ func (s *Server) scheduleRunner(ctx context.Context, name string, caps []Capabil
 		return nil, nil, nil, err
 	}

+	if mTemplate != "" {
+		model.Template, err = template.Parse(mTemplate)
+		if err != nil {
+			return nil, nil, nil, err
+		}
+	}
+
 	if err := model.CheckCapabilities(caps...); err != nil {
 		return nil, nil, nil, fmt.Errorf("%s %w", name, err)
 	}
@@ -120,7 +127,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 	}

 	caps := []Capability{CapabilityCompletion}
-	r, m, opts, err := s.scheduleRunner(c.Request.Context(), req.Model, caps, req.Options, req.KeepAlive)
+	r, m, opts, err := s.scheduleRunner(c.Request.Context(), req.Model, "", caps, req.Options, req.KeepAlive)
 	if errors.Is(err, errCapabilityCompletion) {
 		c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("%q does not support generate", req.Model)})
 		return
@@ -256,7 +263,7 @@ func (s *Server) EmbeddingsHandler(c *gin.Context) {
 		return
 	}

-	r, _, _, err := s.scheduleRunner(c.Request.Context(), req.Model, []Capability{}, req.Options, req.KeepAlive)
+	r, _, _, err := s.scheduleRunner(c.Request.Context(), req.Model, "", []Capability{}, req.Options, req.KeepAlive)
 	if err != nil {
 		handleScheduleError(c, req.Model, err)
 		return
@@ -1132,7 +1139,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
 	}

 	caps := []Capability{CapabilityCompletion}
-	r, m, opts, err := s.scheduleRunner(c.Request.Context(), req.Model, caps, req.Options, req.KeepAlive)
+	r, m, opts, err := s.scheduleRunner(c.Request.Context(), req.Model, req.Template, caps, req.Options, req.KeepAlive)
 	if errors.Is(err, errCapabilityCompletion) {
 		c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("%q does not support chat", req.Model)})
 		return
--- a/server/routes_create_test.go
+++ b/server/routes_create_test.go
@@ -546,8 +546,8 @@ func TestCreateDetectTemplate(t *testing.T) {

 		checkFileExists(t, filepath.Join(p, "blobs", "*"), []string{
 			filepath.Join(p, "blobs", "sha256-553c4a3f747b3d22a4946875f1cc8ed011c2930d83f864a0c7265f9ec0a20413"),
-			filepath.Join(p, "blobs", "sha256-c608dc615584cd20d9d830363dabf8a4783ae5d34245c3d8c115edb3bc7b28e4"),
-			filepath.Join(p, "blobs", "sha256-f836ee110db21567f826332e4cedd746c06d10664fd5a9ea3659e3683a944510"),
+			filepath.Join(p, "blobs", "sha256-9512c372dfc7d84d6065b8dd2b601aeed8cc1a78e7a7aa784a42fff37f5524b7"),
+			filepath.Join(p, "blobs", "sha256-b8b78cb8c6eefd14c06f1af042e6161255bf87bbf2dd14fce57cdac893db8139"),
 		})
 	})

--- a/server/sched.go
+++ b/server/sched.go
@@ -135,6 +135,11 @@ func (s *Scheduler) processPending(ctx context.Context) {
 			}

 			for {
+				cpus := s.getCpuFn()
+				var systemMem gpu.GpuInfo
+				if len(cpus) > 0 {
+					systemMem = cpus[0]
+				}
 				var runnerToExpire *runnerRef
 				s.loadedMu.Lock()
 				runner := s.loaded[pending.model.ModelPath]
@@ -188,6 +193,38 @@ func (s *Scheduler) processPending(ctx context.Context) {
 						break
 					}

+					estimate := llm.EstimateGPULayers(gpus, ggml, pending.model.ProjectorPaths, pending.opts)
+					maxSize := systemMem.FreeMemory
+
+					// Add available GPU memory to the total pool
+					// macOS hardware has unified memory so don't double count
+					if runtime.GOOS != "darwin" {
+						for _, gpu := range gpus {
+							if gpu.Library == "cpu" {
+								continue
+							}
+							if loadedCount == 0 {
+								// If no other models are loaded, set the limit based on what's available
+								maxSize += gpu.FreeMemory
+							} else {
+								// Other models could be unloaded, favor total memory for limit
+								maxSize += gpu.TotalMemory
+							}
+						}
+					}
+
+					// Warn when the estimated model size exceeds free system memory plus GPU memory (an error is returned on Linux only, below)
+					if estimate.TotalSize > maxSize {
+						slog.Warn("model request too large for system", "requested", format.HumanBytes2(estimate.TotalSize), "system", format.HumanBytes2(maxSize))
+
+						// Linux will crash if over-allocating memory - return an error to the user.
+						// TODO (jmorganca): add reasonable upper limits for darwin and windows as well
+						if runtime.GOOS == "linux" {
+							pending.errCh <- fmt.Errorf("requested model (%s) is too large for this system (%s)", format.HumanBytes2(estimate.TotalSize), format.HumanBytes2(maxSize))
+							break
+						}
+					}
+
 					// Evaluate if the model will fit in the available system memory, or if we should unload a model first
 					if len(gpus) == 1 && gpus[0].Library == "cpu" {
 						// simplifying assumption of defaultParallel when in CPU mode
--- a/template/alfred.gotmpl
+++ b/template/alfred.gotmpl
@@ -1 +1,8 @@
-{{ if .System }}<start_system>{{ .System }}<end_message>{{ end }}{{ if .Prompt }}<start_user>{{ .Prompt }}<end_message>{{ end }}<start_assistant>{{ .Response }}<end_message>
+{{- if .Messages }}
+{{- if .System }}<start_system>{{ .System }}<end_message>
+{{- end }}
+{{- range .Messages }}<start_{{ .Role }}>{{ .Content }}<end_message>
+{{- end }}<start_assistant>
+{{- else }}
+{{ if .System }}<start_system>{{ .System }}<end_message>{{ end }}{{ if .Prompt }}<start_user>{{ .Prompt }}<end_message>{{ end }}<start_assistant>{{ .Response }}<end_message>
+{{- end }}
--- a/template/alpaca.gotmpl
+++ b/template/alpaca.gotmpl
@@ -1,3 +1,14 @@
+{{- if .Messages }}
+{{- if .System }}{{ .System }}
+{{- end }}
+{{- range .Messages }}
+{{- if eq .Role "user" }}### Instruction:
+{{- else if eq .Role "assistant" }}### Response:
+{{- end }}
+{{ .Content }}
+
+{{ end }}### Response:
+{{ else }}
 {{ if .System }}{{ .System }}

 {{ end }}{{ if .Prompt }}### Instruction:
@@ -5,4 +16,4 @@

 {{ end }}### Response:
 {{ .Response }}
-
+{{- end }}
--- a/template/chatml.gotmpl
+++ b/template/chatml.gotmpl
@@ -1,6 +1,15 @@
+{{- if .Messages }}
+{{- if .System }}<|im_start|>system
+{{ .System }}<|im_end|>
+{{ end }}
+{{- range .Messages }}<|im_start|>{{ .Role }}
+{{ .Content }}<|im_end|>
+{{ end }}<|im_start|>assistant
+{{ else }}
 {{ if .System }}<|im_start|>system
 {{ .System }}<|im_end|>
 {{ end }}{{ if .Prompt }}<|im_start|>user
 {{ .Prompt }}<|im_end|>
 {{ end }}<|im_start|>assistant
 {{ .Response }}<|im_end|>
+{{- end }}
--- a/template/chatqa.gotmpl
+++ b/template/chatqa.gotmpl
@@ -1,6 +1,17 @@
+{{- if .Messages }}
+{{- if .System }}System: {{ .System }}
+
+{{ end }}
+{{- range .Messages }}
+{{- if eq .Role "user" }}User:
+{{- else if eq .Role "assistant" }}Assistant:
+{{- end }} {{ .Content }}
+
+{{ end }}Assistant:
+{{- else }}
 {{ if .System }}System: {{ .System }}

 {{ end }}{{ if .Prompt }}User: {{ .Prompt }}

-{{ end }}Assistant: {{ .Response }}
-
+{{ end }}Assistant: <|begin_of_text|>{{ .Response }}
+{{- end }}
--- a/template/codellama-70b-instruct.gotmpl
+++ b/template/codellama-70b-instruct.gotmpl
@@ -1,10 +1,19 @@
-{{ if .System }}Source: system
+{{- if .Messages }}
+{{- if .System }}Source: system

- {{ .System }} <step> {{ end }}Source: user
+ {{ .System }} <step> {{ end }}
+{{- range .Messages }}Source: {{ .Role }}
+
+ {{ .Content }} <step> {{ end }}Source: assistant
+Destination: user
+
+{{ else }}
+{{ if .System }} Source: system
+
+ {{ .System }} <step>{{ end }} Source: user

 {{ .Prompt }} <step> Source: assistant
-{{- if not .Response }}
 Destination: user
-{{- end }}

- {{ .Response }} <step> 
+ {{ .Response }}<step>
+{{- end }}
--- a/template/falcon-instruct.gotmpl
+++ b/template/falcon-instruct.gotmpl
@@ -1,5 +1,13 @@
-{{ if .System }}System: {{ .System }}
-{{ end }}{{ if .Prompt }}User:
-{{ .Prompt }}
+{{- if .Messages }}
+{{- if .System }}System: {{ .System }}
+{{ end }}
+{{- range .Messages }}
+{{- if eq .Role "user" }}User:
+{{ else if eq .Role "assistant" }}Falcon:
+{{ end }}{{ .Content }}
 {{ end }}Falcon:
-{{ .Response }}
+{{ else }}
+{{ if .System }}{{ .System }}
+{{ end }}{{ if .Prompt }}User: {{ .Prompt }}
+{{ end }}Assistant: {{ .Response }}
+{{- end }}
--- a/template/gemma-instruct.gotmpl
+++ b/template/gemma-instruct.gotmpl
@@ -1,5 +1,16 @@
+{{- if .Messages }}
+{{- range $index, $_ := .Messages }}<start_of_turn>
+{{- if eq .Role "user" }}user
+{{- if and $.System (eq $index 0) }}
+{{ $.System }}
+{{- end }}
+{{- else if eq .Role "assistant" }}model
+{{- end }}
+{{ .Content }}<end_of_turn>
+{{ end }}<start_of_turn>model
+{{ else }}
 <start_of_turn>user
-{{ if .System }}{{ .System }}
-{{ end }}{{ .Prompt }}<end_of_turn>
+{{ if .System }}{{ .System }} {{ end }}{{ .Prompt }}<end_of_turn>
 <start_of_turn>model
 {{ .Response }}<end_of_turn>
+{{- end }}
--- a/template/granite-instruct.gotmpl
+++ b/template/granite-instruct.gotmpl
@@ -1,4 +1,18 @@
-{{ if .System }}System:
+{{- if .Messages }}
+{{- if .System }}System:
+{{ .System }}
+
+{{ end }}
+{{- range .Messages }}
+{{- if eq .Role "user" }}Question:
+{{- else if eq .Role "assistant" }}Answer:
+{{- end }}
+{{ .Content }}
+
+{{ end }}Answer:
+{{ else }}
+{{ if .System }}
+System:
 {{ .System }}

 {{ end }}{{ if .Prompt }}Question:
@@ -6,4 +20,4 @@

 {{ end }}Answer:
 {{ .Response }}
-
+{{- end }}
--- a/template/llama2-chat.gotmpl
+++ b/template/llama2-chat.gotmpl
@@ -1,6 +1,16 @@
-[INST] <<SYS>>
-{{- if .System }}
-{{ .System }}
+{{- if .Messages }}
+{{- range $index, $_ := .Messages }}
+{{- if eq .Role "user" }}[INST] {{ if eq $index 0 }}<<SYS>>
+{{- if $.System }}
+{{ $.System }}
 {{ end }}<</SYS>>

-{{ .Prompt }} [/INST] {{ .Response }}</s><s>
+{{ end }}{{ .Content }}
+{{- else }} [/INST] {{ .Content }}</s><s>
+{{- end }}
+{{- end }} [/INST]
+{{- else }}
+[INST] <<SYS>>{{ .System }}<</SYS>>
+
+{{ .Prompt }} [/INST] {{ .Response }}
+{{- end }}
--- a/template/llama3-instruct.gotmpl
+++ b/template/llama3-instruct.gotmpl
@@ -1,7 +1,19 @@
+{{- if .Messages }}
+{{- if .System }}<|start_header_id|>system<|end_header_id|>
+
+{{ .System }}<|eot_id|>
+{{- end }}
+{{- range .Messages }}<|start_header_id|>{{ .Role }}<|end_header_id|>
+
+{{ .Content }}<|eot_id|>
+{{- end }}<|start_header_id|>assistant<|end_header_id|>
+
+{{ else }}
 {{ if .System }}<|start_header_id|>system<|end_header_id|>

 {{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>

 {{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>

-{{ .Response }}<|eot_id|>
+{{ .Response }}<|eot_id|>
+{{- end }}
--- a/template/magicoder.gotmpl
+++ b/template/magicoder.gotmpl
@@ -1,3 +1,15 @@
+{{- if .Messages }}
+{{- if .System }}{{ .System }}
+
+{{ end }}
+{{- range .Messages }}
+{{- if eq .Role "user" }}@@ Instruction
+{{- else if eq .Role "assistant" }}@@ Response
+{{- end }}
+{{ .Content }}
+
+{{ end }}@@ Response
+{{ else }}
 {{ if .System }}{{ .System }}

 {{ end }}{{ if .Prompt }}@@ Instruction
@@ -5,4 +17,4 @@

 {{ end }}@@ Response
 {{ .Response }}
-
+{{- end }}
--- a/template/mistral-instruct.gotmpl
+++ b/template/mistral-instruct.gotmpl
@@ -1,3 +1,9 @@
-[INST] {{ if .System }}{{ .System }}
-
-{{ end }}{{ .Prompt }}[/INST] {{ .Response }}</s>
+{{- if .Messages }}
+{{- range $index, $_ := .Messages }}
+{{- if eq .Role "user" }}[INST] {{ if and $.System (eq (len (slice $.Messages $index)) 1) }}{{ $.System }}
+{{ end }}{{ .Content }}
+{{- else if eq .Role "assistant" }}[/INST] {{ .Content }}</s>
+{{- end }}
+{{- end }}[/INST]
+{{- else }}[INST] {{ if .System }}{{ .System }} {{ end }}{{ .Prompt }} [/INST] {{ .Response }}
+{{- end }}
--- a/template/openchat.gotmpl
+++ b/template/openchat.gotmpl
@@ -1 +1,11 @@
-{{ if .System }}GPT4 Correct System: {{ .System }}<|end_of_turn|>{{ end }}GPT4 Correct User: {{ .Prompt }}<|end_of_turn|>GPT4 Correct Assistant: {{ .Response }}<|end_of_turn|>
+{{- if .Messages }}
+{{- if .System }}GPT4 Correct System: {{ .System }}<|end_of_turn|>
+{{- end }}
+{{- range .Messages }}GPT4 Correct
+{{- if eq .Role "user" }} User:
+{{- else if eq .Role "assistant" }} Assistant:
+{{- end }} {{ .Content }}<|end_of_turn|>
+{{- end }}GPT4 Correct Assistant:
+{{- else }}
+{{ .System }}<|end_of_turn|>GPT4 Correct User: {{ .Prompt }}<|end_of_turn|>GPT4 Correct Assistant: {{ .Response }}<|end_of_turn|>
+{{- end }}
--- a/template/phi-3.gotmpl
+++ b/template/phi-3.gotmpl
@@ -1,6 +1,15 @@
+{{- if .Messages }}
+{{- if .System }}<|system|>
+{{ .System }}<|end|>
+{{ end }}
+{{- range .Messages }}<|{{ .Role }}|>
+{{ .Content }}<|end|>
+{{ end }}<|assistant|>
+{{ else }}
 {{ if .System }}<|system|>
 {{ .System }}<|end|>
 {{ end }}{{ if .Prompt }}<|user|>
 {{ .Prompt }}<|end|>
 {{ end }}<|assistant|>
 {{ .Response }}<|end|>
+{{- end }}
--- a/template/solar-instruct.gotmpl
+++ b/template/solar-instruct.gotmpl
@@ -1,3 +1,16 @@
+{{- if .Messages }}
+{{- if .System }}### System:
+{{ .System }}
+
+{{ end }}
+{{- range .Messages }}
+{{- if eq .Role "user" }}### User:
+{{ .Content }}
+{{ else if eq .Role "assistant" }}### Assistant:
+{{ .Content }}</s>
+{{ end }}
+{{ end }}### Assistant:
+{{ else }}
 {{ if .System }}### System:
 {{ .System }}

@@ -5,5 +18,5 @@
 {{ .Prompt }}

 {{ end }}### Assistant:
-{{ .Response }}</s>
-
+{{ .Response }}
+{{- end }}
--- a/template/starcoder2-instruct.gotmpl
+++ b/template/starcoder2-instruct.gotmpl
@@ -1,8 +1,24 @@
+{{- if .Messages }}
+{{- if .System }}{{ .System }}
+
+{{ end }}
+{{- range .Messages }}
+{{- if eq .Role "user" }}### Instruction
+{{ .Content }}
+
+{{ else if eq .Role "assistant" }}### Response
+{{ .Content }}<|endoftext|>
+
+{{ end }}
+{{- end }}### Response
+{{ else }}
 {{ if .System }}{{ .System }}

 {{ end }}{{ if .Prompt }}### Instruction
 {{ .Prompt }}

+
 {{ end }}### Response
 {{ .Response }}<|endoftext|>

+{{- end }}
--- a/template/template.go
+++ b/template/template.go
@@ -143,74 +143,52 @@ func (t *Template) Vars() []string {

 type Values struct {
 	Messages []api.Message
-
-	// forceLegacy is a flag used to test compatibility with legacy templates
-	forceLegacy bool
 }

 func (t *Template) Execute(w io.Writer, v Values) error {
-	system, messages := collate(v.Messages)
-	if !v.forceLegacy && slices.Contains(t.Vars(), "messages") {
+	system, collated := collate(v.Messages)
+	if slices.Contains(t.Vars(), "messages") {
 		return t.Template.Execute(w, map[string]any{
 			"System":   system,
-			"Messages": messages,
+			"Messages": collated,
 		})
 	}

-	system = ""
 	var b bytes.Buffer
 	var prompt, response string
-	for _, m := range messages {
-		execute := func () error {
+	for i, m := range collated {
+		if m.Role == "user" {
+			prompt = m.Content
+		} else {
+			response = m.Content
+		}
+
+		if i != len(collated)-1 && prompt != "" && response != "" {
 			if err := t.Template.Execute(&b, map[string]any{
-				"System":   system,
+				"System":   "",
 				"Prompt":   prompt,
 				"Response": response,
 			}); err != nil {
 				return err
 			}

-			system = ""
 			prompt = ""
 			response = ""
-			return nil
-		}
-
-		switch m.Role {
-		case "system":
-			if prompt != "" || response != "" {
-				if err := execute(); err != nil {
-					return err
-				}
-			}
-			system = m.Content
-		case "user":
-			if response != "" {
-				if err := execute(); err != nil {
-					return err
-				}
-			}
-			prompt = m.Content
-		case "assistant":
-			response = m.Content
 		}
 	}

 	var cut bool
-	nodes := deleteNode(t.Template.Root.Copy(), func(n parse.Node) bool {
-		switch t := n.(type) {
-		case *parse.ActionNode:
-		case *parse.FieldNode:
-			if slices.Contains(t.Ident, "Response") {
-				cut = true
-			}
+	tree := t.Template.Copy()
+	// for the last message, drop the first top-level node that references .Response and every node after it
+	tree.Root.Nodes = slices.DeleteFunc(tree.Root.Nodes, func(n parse.Node) bool {
+		if slices.Contains(parseNode(n), "Response") {
+			cut = true
 		}

 		return cut
 	})

-	tree := parse.Tree{Root: nodes.(*parse.ListNode)}
-	if err := template.Must(template.New("").AddParseTree("", &tree)).Execute(&b, map[string]any{
+	if err := template.Must(template.New("").AddParseTree("", tree)).Execute(&b, map[string]any{
 		"System": system,
 		"Prompt": prompt,
 	}); err != nil {
@@ -221,16 +199,25 @@ func (t *Template) Execute(w io.Writer, v Values) error {
 	return err
 }

-// collate messages based on role. consecutive messages of the same role are merged
-// into a single message. collate also collects and returns all system messages.
-// collate mutates message content adding image tags ([img-%d]) as needed
-func collate(msgs []api.Message) (string, []*api.Message) {
-	var n int
+type messages []*api.Message

-	var system []string
-	var collated []*api.Message
+// collate messages based on role. consecutive messages of the same role are merged
+// into a single message. collate also pulls out and merges messages with Role == "system"
+// which are templated separately. As a side effect, it mangles message content adding image
+// tags ([img-%d]) as needed
+func collate(msgs []api.Message) (system string, collated messages) {
+	var n int
 	for i := range msgs {
 		msg := msgs[i]
+		if msg.Role == "system" {
+			if system != "" {
+				system += "\n\n"
+			}
+
+			system += msg.Content
+			continue
+		}
+
 		for range msg.Images {
 			imageTag := fmt.Sprintf("[img-%d]", n)
 			if !strings.Contains(msg.Content, "[img]") {
@@ -241,10 +228,6 @@ func collate(msgs []api.Message) (string, []*api.Message) {
 			n++
 		}

-		if msg.Role == "system" {
-			system = append(system, msg.Content)
-		}
-
 		if len(collated) > 0 && collated[len(collated)-1].Role == msg.Role {
 			collated[len(collated)-1].Content += "\n\n" + msg.Content
 		} else {
@@ -252,7 +235,7 @@ func collate(msgs []api.Message) (string, []*api.Message) {
 		}
 	}

-	return strings.Join(system, "\n\n"), collated
+	return
 }

 func parseNode(n parse.Node) []string {
@@ -303,72 +286,3 @@ func parseNode(n parse.Node) []string {

 	return nil
 }
-
-// deleteNode walks the node list and deletes nodes that match the predicate
-// this is currently to remove the {{ .Response }} node from templates
-func deleteNode(n parse.Node, fn func(parse.Node) bool) parse.Node {
-	var walk func(n parse.Node) parse.Node
-	walk = func(n parse.Node) parse.Node {
-		if fn(n) {
-			return nil
-		}
-
-		switch t := n.(type) {
-		case *parse.ListNode:
-			var nodes []parse.Node
-			for _, c := range t.Nodes {
-				if n := walk(c); n != nil {
-					nodes = append(nodes, n)
-				}
-			}
-
-			t.Nodes = nodes
-			return t
-		case *parse.IfNode:
-			t.BranchNode = *(walk(&t.BranchNode).(*parse.BranchNode))
-		case *parse.WithNode:
-			t.BranchNode = *(walk(&t.BranchNode).(*parse.BranchNode))
-		case *parse.RangeNode:
-			t.BranchNode = *(walk(&t.BranchNode).(*parse.BranchNode))
-		case *parse.BranchNode:
-			t.List = walk(t.List).(*parse.ListNode)
-			if t.ElseList != nil {
-				t.ElseList = walk(t.ElseList).(*parse.ListNode)
-			}
-		case *parse.ActionNode:
-			n := walk(t.Pipe)
-			if n == nil {
-				return nil
-			}
-
-			t.Pipe = n.(*parse.PipeNode)
-		case *parse.PipeNode:
-			var commands []*parse.CommandNode
-			for _, c := range t.Cmds {
-				var args []parse.Node
-				for _, a := range c.Args {
-					if n := walk(a); n != nil {
-						args = append(args, n)
-					}
-				}
-
-				if len(args) == 0 {
-					return nil
-				}
-
-				c.Args = args
-				commands = append(commands, c)
-			}
-
-			if len(commands) == 0 {
-				return nil
-			}
-
-			t.Cmds = commands
-		}
-
-		return n
-	}
-
-	return walk(n)
-}
--- a/template/template_test.go
+++ b/template/template_test.go
@@ -105,8 +105,8 @@ func TestTemplate(t *testing.T) {
 			}

 			for n, tt := range cases {
-				var actual bytes.Buffer
 				t.Run(n, func(t *testing.T) {
+					var actual bytes.Buffer
 					if err := tmpl.Execute(&actual, Values{Messages: tt}); err != nil {
 						t.Fatal(err)
 					}
@@ -116,34 +116,7 @@ func TestTemplate(t *testing.T) {
 						t.Fatal(err)
 					}

-					bts := actual.Bytes()
-
-					if slices.Contains([]string{"chatqa.gotmpl", "llama2-chat.gotmpl", "mistral-instruct.gotmpl", "openchat.gotmpl", "vicuna.gotmpl"}, match) && bts[len(bts)-1] == ' ' {
-						t.Log("removing trailing space from output")
-						bts = bts[:len(bts)-1]
-					}
-
-					if diff := cmp.Diff(bts, expect); diff != "" {
-						t.Errorf("mismatch (-got +want):\n%s", diff)
-					}
-				})
-
-				t.Run("legacy", func(t *testing.T) {
-					t.Skip("legacy outputs are currently default outputs")
-					var legacy bytes.Buffer
-					if err := tmpl.Execute(&legacy, Values{Messages: tt, forceLegacy: true}); err != nil {
-						t.Fatal(err)
-					}
-
-					legacyBytes := legacy.Bytes()
-					if slices.Contains([]string{"chatqa.gotmpl", "openchat.gotmpl", "vicuna.gotmpl"}, match) && legacyBytes[len(legacyBytes)-1] == ' ' {
-						t.Log("removing trailing space from legacy output")
-						legacyBytes = legacyBytes[:len(legacyBytes)-1]
-					} else if slices.Contains([]string{"codellama-70b-instruct.gotmpl", "llama2-chat.gotmpl", "mistral-instruct.gotmpl"}, match) {
-						t.Skip("legacy outputs cannot be compared to messages outputs")
-					}
-
-					if diff := cmp.Diff(legacyBytes, actual.Bytes()); diff != "" {
+					if diff := cmp.Diff(actual.Bytes(), expect); diff != "" {
 						t.Errorf("mismatch (-got +want):\n%s", diff)
 					}
 				})
@@ -162,24 +135,7 @@ func TestParse(t *testing.T) {
 		{"{{ .System }} {{ .Prompt }} {{ .Response }}", []string{"prompt", "response", "system"}},
 		{"{{ with .Tools }}{{ . }}{{ end }} {{ .System }} {{ .Prompt }}", []string{"prompt", "response", "system", "tools"}},
 		{"{{ range .Messages }}{{ .Role }} {{ .Content }}{{ end }}", []string{"content", "messages", "role"}},
-		{`{{- range .Messages }}
-{{- if eq .Role "system" }}SYSTEM:
-{{- else if eq .Role "user" }}USER:
-{{- else if eq .Role "assistant" }}ASSISTANT:
-{{- end }} {{ .Content }}
-{{- end }}`, []string{"content", "messages", "role"}},
-		{`{{- if .Messages }}
-{{- range .Messages }}<|im_start|>{{ .Role }}
-{{ .Content }}<|im_end|>
-{{ end }}<|im_start|>assistant
-{{ else -}}
-{{ if .System }}<|im_start|>system
-{{ .System }}<|im_end|>
-{{ end }}{{ if .Prompt }}<|im_start|>user
-{{ .Prompt }}<|im_end|>
-{{ end }}<|im_start|>assistant
-{{ .Response }}<|im_end|>
-{{- end -}}`, []string{"content", "messages", "prompt", "response", "role", "system"}},
+		{"{{ range .Messages }}{{ if eq .Role \"system\" }}SYSTEM: {{ .Content }}{{ else if eq .Role \"user\" }}USER: {{ .Content }}{{ else if eq .Role \"assistant\" }}ASSISTANT: {{ .Content }}{{ end }}{{ end }}", []string{"content", "messages", "role"}},
 	}

 	for _, tt := range cases {
@@ -189,8 +145,9 @@ func TestParse(t *testing.T) {
 				t.Fatal(err)
 			}

-			if diff := cmp.Diff(tmpl.Vars(), tt.vars); diff != "" {
-				t.Errorf("mismatch (-got +want):\n%s", diff)
+			vars := tmpl.Vars()
+			if !slices.Equal(tt.vars, vars) {
+				t.Errorf("expected %v, got %v", tt.vars, vars)
 			}
 		})
 	}
@@ -210,17 +167,12 @@ func TestExecuteWithMessages(t *testing.T) {
 		{
 			"mistral",
 			[]template{
-				{"no response", `[INST] {{ if .System }}{{ .System }}
-
-{{ end }}{{ .Prompt }}[/INST] `},
-				{"response", `[INST] {{ if .System }}{{ .System }}
-
-{{ end }}{{ .Prompt }}[/INST] {{ .Response }}`},
-				{"messages", `[INST] {{ if .System }}{{ .System }}
-
-{{ end }}
-{{- range .Messages }}
-{{- if eq .Role "user" }}{{ .Content }}[/INST] {{ else if eq .Role "assistant" }}{{ .Content }}[INST] {{ end }}
+				{"no response", `[INST] {{ if .System }}{{ .System }}{{ "\n\n" }}{{ end }}{{ .Prompt }}[/INST] `},
+				{"response", `[INST] {{ if .System }}{{ .System }}{{ "\n\n" }}{{ end }}{{ .Prompt }}[/INST] {{ .Response }}`},
+				{"messages", `{{- range $index, $_ := .Messages }}
+{{- if eq .Role "user" }}[INST] {{ if and (eq (len (slice $.Messages $index)) 1) $.System }}{{ $.System }}{{ "\n\n" }}
+{{- end }}{{ .Content }}[/INST] {{ else if eq .Role "assistant" }}{{ .Content }}
+{{- end }}
 {{- end }}`},
 			},
 			Values{
@@ -235,17 +187,13 @@ func TestExecuteWithMessages(t *testing.T) {
 		{
 			"mistral system",
 			[]template{
-				{"no response", `[INST] {{ if .System }}{{ .System }}
-
-{{ end }}{{ .Prompt }}[/INST] `},
-				{"response", `[INST] {{ if .System }}{{ .System }}
-
-{{ end }}{{ .Prompt }}[/INST] {{ .Response }}`},
-				{"messages", `[INST] {{ if .System }}{{ .System }}
-
-{{ end }}
-{{- range .Messages }}
-{{- if eq .Role "user" }}{{ .Content }}[/INST] {{ else if eq .Role "assistant" }}{{ .Content }}[INST] {{ end }}
+				{"no response", `[INST] {{ if .System }}{{ .System }}{{ "\n\n" }}{{ end }}{{ .Prompt }}[/INST] `},
+				{"response", `[INST] {{ if .System }}{{ .System }}{{ "\n\n" }}{{ end }}{{ .Prompt }}[/INST] {{ .Response }}`},
+				{"messages", `
+{{- range $index, $_ := .Messages }}
+{{- if eq .Role "user" }}[INST] {{ if and (eq (len (slice $.Messages $index)) 1) $.System }}{{ $.System }}{{ "\n\n" }}
+{{- end }}{{ .Content }}[/INST] {{ else if eq .Role "assistant" }}{{ .Content }}
+{{- end }}
 {{- end }}`},
 			},
 			Values{
@@ -256,9 +204,9 @@ func TestExecuteWithMessages(t *testing.T) {
 					{Role: "user", Content: "What is your name?"},
 				},
 			},
-			`[INST] You are a helpful assistant!
+			`[INST] Hello friend![/INST] Hello human![INST] You are a helpful assistant!

-Hello friend![/INST] Hello human![INST] What is your name?[/INST] `,
+What is your name?[/INST] `,
 		},
 		{
 			"chatml",
@@ -272,9 +220,12 @@ Hello friend![/INST] Hello human![INST] What is your name?[/INST] `,
 {{ .Response }}<|im_end|>
 `},
 				{"messages", `
-{{- range $index, $_ := .Messages }}<|im_start|>{{ .Role }}
-{{ .Content }}<|im_end|>
-{{ end }}<|im_start|>assistant
+{{- range $index, $_ := .Messages }}
+{{- if and (eq .Role "user") (eq (len (slice $.Messages $index)) 1) $.System }}<|im_start|>system
+{{ $.System }}<|im_end|>{{ "\n" }}
+{{- end }}<|im_start|>{{ .Role }}
+{{ .Content }}<|im_end|>{{ "\n" }}
+{{- end }}<|im_start|>assistant
 `},
 			},
 			Values{
@@ -285,12 +236,12 @@ Hello friend![/INST] Hello human![INST] What is your name?[/INST] `,
 					{Role: "user", Content: "What is your name?"},
 				},
 			},
-			`<|im_start|>system
-You are a helpful assistant!<|im_end|>
-<|im_start|>user
+			`<|im_start|>user
 Hello friend!<|im_end|>
 <|im_start|>assistant
 Hello human!<|im_end|>
+<|im_start|>system
+You are a helpful assistant!<|im_end|>
 <|im_start|>user
 What is your name?<|im_end|>
 <|im_start|>assistant
@@ -307,11 +258,9 @@ What is your name?<|im_end|>
 `},
 				{"messages", `
 {{- range .Messages }}
-{{- if eq .Role "user" }}Question: {{ .Content }}
-
-{{ else if eq .Role "assistant" }}Answer: {{ .Content }}
-
-{{ end }}
+{{- if eq .Role "user" }}Question: {{ .Content }}{{ "\n\n" }}
+{{- else if eq .Role "assistant" }}Answer: {{ .Content }}{{ "\n\n" }}
+{{- end }}
 {{- end }}Answer: `},
 			},
 			Values{
@@ -351,8 +300,8 @@ Answer: `,
 						t.Fatal(err)
 					}

-					if diff := cmp.Diff(b.String(), tt.expected); diff != "" {
-						t.Errorf("mismatch (-got +want):\n%s", diff)
+					if b.String() != tt.expected {
+						t.Errorf("expected\n%s,\ngot\n%s", tt.expected, b.String())
 					}
 				})
 			}
--- a/template/testdata/alpaca.gotmpl/system-user-assistant-user
+++ b/template/testdata/alpaca.gotmpl/system-user-assistant-user
@@ -1,6 +1,4 @@
-You are a helpful assistant.
-
-### Instruction:
+You are a helpful assistant.### Instruction:
 Hello, how are you?

 ### Response:
--- a/template/testdata/codellama-70b-instruct.gotmpl/system-user-assistant-user
+++ b/template/testdata/codellama-70b-instruct.gotmpl/system-user-assistant-user
@@ -9,4 +9,3 @@ Source: system
 I'd like to show off how chat templating works! <step> Source: assistant
 Destination: user

- 
--- a/template/testdata/codellama-70b-instruct.gotmpl/user
+++ b/template/testdata/codellama-70b-instruct.gotmpl/user
@@ -3,4 +3,3 @@ Source: user
 Hello, how are you? <step> Source: assistant
 Destination: user

- 
--- a/template/testdata/codellama-70b-instruct.gotmpl/user-assistant-user
+++ b/template/testdata/codellama-70b-instruct.gotmpl/user-assistant-user
@@ -7,4 +7,3 @@ Source: user
 I'd like to show off how chat templating works! <step> Source: assistant
 Destination: user

- 
--- a/template/testdata/llama2-chat.gotmpl/system-user-assistant-user
+++ b/template/testdata/llama2-chat.gotmpl/system-user-assistant-user
@@ -2,6 +2,4 @@
 You are a helpful assistant.
 <</SYS>>

-Hello, how are you? [/INST] I'm doing great. How can I help you today?</s><s>[INST] <<SYS>><</SYS>>
-
-I'd like to show off how chat templating works! [/INST]
+Hello, how are you? [/INST] I'm doing great. How can I help you today?</s><s>[INST] I'd like to show off how chat templating works! [/INST]
--- a/template/testdata/llama2-chat.gotmpl/user-assistant-user
+++ b/template/testdata/llama2-chat.gotmpl/user-assistant-user
@@ -1,5 +1,3 @@
 [INST] <<SYS>><</SYS>>

-Hello, how are you? [/INST] I'm doing great. How can I help you today?</s><s>[INST] <<SYS>><</SYS>>
-
-I'd like to show off how chat templating works! [/INST]
+Hello, how are you? [/INST] I'm doing great. How can I help you today?</s><s>[INST] I'd like to show off how chat templating works! [/INST]
--- a/template/testdata/mistral-instruct.gotmpl/system-user-assistant-user
+++ b/template/testdata/mistral-instruct.gotmpl/system-user-assistant-user
@@ -1,3 +1,2 @@
-[INST] You are a helpful assistant.
-
-Hello, how are you?[/INST] I'm doing great. How can I help you today?</s>[INST] I'd like to show off how chat templating works![/INST]
+[INST] Hello, how are you?[/INST] I'm doing great. How can I help you today?</s>[INST] You are a helpful assistant.
+I'd like to show off how chat templating works![/INST]
--- a/template/testdata/openchat.gotmpl/system-user-assistant-user
+++ b/template/testdata/openchat.gotmpl/system-user-assistant-user
@@ -1 +1 @@
-GPT4 Correct System: You are a helpful assistant.<|end_of_turn|>GPT4 Correct User: Hello, how are you?<|end_of_turn|>GPT4 Correct Assistant: I'm doing great. How can I help you today?<|end_of_turn|>GPT4 Correct User: I'd like to show off how chat templating works!<|end_of_turn|>GPT4 Correct Assistant:
+GPT Correct System: You are a helpful assistant.<|end_of_turn|>GPT Correct User: Hello, how are you?<|end_of_turn|>GPT Correct Assistant: I'm doing great. How can I help you today?<|end_of_turn|>GPT Correct User: I'd like to show off how chat templating works!<|end_of_turn|>GPT Correct Assistant:
--- a/template/testdata/openchat.gotmpl/user
+++ b/template/testdata/openchat.gotmpl/user
@@ -1 +1 @@
-GPT4 Correct User: Hello, how are you?<|end_of_turn|>GPT4 Correct Assistant:
+GPT Correct User: Hello, how are you?<|end_of_turn|>GPT Correct Assistant:
--- a/template/testdata/openchat.gotmpl/user-assistant-user
+++ b/template/testdata/openchat.gotmpl/user-assistant-user
@@ -1 +1 @@
-GPT4 Correct User: Hello, how are you?<|end_of_turn|>GPT4 Correct Assistant: I'm doing great. How can I help you today?<|end_of_turn|>GPT4 Correct User: I'd like to show off how chat templating works!<|end_of_turn|>GPT4 Correct Assistant:
+GPT Correct User: Hello, how are you?<|end_of_turn|>GPT Correct Assistant: I'm doing great. How can I help you today?<|end_of_turn|>GPT Correct User: I'd like to show off how chat templating works!<|end_of_turn|>GPT Correct Assistant:
--- a/template/vicuna.gotmpl
+++ b/template/vicuna.gotmpl
@@ -1,4 +1,14 @@
-{{ if .System }}{{ .System }}
+{{- if .Messages }}
+{{- if .System }}{{ .System }}

+{{ end }}
+{{- range .Messages }}
+{{- if eq .Role "user" }}USER: {{ .Content }}
+{{ else if eq .Role "assistant" }}ASSISTANT: {{ .Content }}</s>
+{{ end }}
+{{- end }}ASSISTANT:
+{{- else }}
+{{ if .System }}{{ .System }}
 {{ end }}{{ if .Prompt }}USER: {{ .Prompt }}
-{{ end }}ASSISTANT: {{ .Response }}</s>
+{{ end }}ASSISTANT: {{ .Response }}
+{{- end }}
--- a/template/zephyr.gotmpl
+++ b/template/zephyr.gotmpl
@@ -1,6 +1,15 @@
+{{- if .Messages }}
+{{- if .System }}<|system|>
+{{ .System }}</s>
+{{ end }}
+{{- range .Messages }}<|{{ .Role }}|>
+{{ .Content }}</s>
+{{ end }}<|assistant|>
+{{ else }}
 {{ if .System }}<|system|>
 {{ .System }}</s>
 {{ end }}{{ if .Prompt }}<|user|>
 {{ .Prompt }}</s>
 {{ end }}<|assistant|>
 {{ .Response }}</s>
+{{- end }}