Mirror of https://github.com/ollama/ollama.git (synced 2026-01-10 08:28:20 -05:00)
Compare commits: royh-preci...v0.2.4 (6 commits)
| Author | SHA1 | Date |
|---|---|---|
| | 1ed0aa8fea | |
| | ef98803d63 | |
| | 02fea420e5 | |
| | 22c5451fc2 | |
| | 23ebbaa46e | |
| | 9ac0a7a50b | |
README.md

@@ -293,6 +293,7 @@ See the [API documentation](./docs/api.md) for all endpoints.

 - [OllamaSpring](https://github.com/CrazyNeil/OllamaSpring) (Ollama Client for macOS)
 - [LLocal.in](https://github.com/kartikm7/llocal) (Easy to use Electron Desktop Client for Ollama)
 - [Ollama with Google Mesop](https://github.com/rapidarchitect/ollama_mesop/) (Mesop Chat Client implementation with Ollama)
+- [Kerlig AI](https://www.kerlig.com/) (AI writing assistant for macOS)

 ### Terminal
llm/server.go

@@ -127,7 +127,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr

     // On linux, over-allocating CPU memory will almost always result in an error
     if runtime.GOOS == "linux" {
         systemMemoryRequired := estimate.TotalSize - estimate.VRAMSize
-        available := min(systemTotalMemory, systemFreeMemory+systemSwapFreeMemory)
+        available := systemFreeMemory + systemSwapFreeMemory
         if systemMemoryRequired > available {
             slog.Warn("model request too large for system", "requested", format.HumanBytes2(systemMemoryRequired), "available", available, "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "swap", format.HumanBytes2(systemSwapFreeMemory))
             return nil, fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(systemMemoryRequired), format.HumanBytes2(available))
         }
     }
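For orientation, the guard drops the `min(systemTotalMemory, ...)` cap, so free swap now counts toward available memory even when the sum exceeds physical RAM. Below is a minimal standalone sketch of the check, assuming the CPU-resident share of the model is its total estimate minus what fits in VRAM; all byte counts are hypothetical stand-ins for values the real server reads from the OS and the model's memory estimate.

```go
package main

import "fmt"

// checkSystemMemory is a minimal sketch of the Linux guard above; the
// parameters are hypothetical stand-ins for the model's memory estimate
// and the free RAM/swap the real server reads from the OS.
func checkSystemMemory(totalSize, vramSize, freeMem, freeSwap uint64) error {
	// CPU-resident portion of the model: whatever doesn't fit in VRAM.
	required := totalSize - vramSize
	// After this change, available memory is free RAM plus free swap,
	// no longer capped at total physical memory.
	available := freeMem + freeSwap
	if required > available {
		return fmt.Errorf("model requires more system memory (%d B) than is available (%d B)", required, available)
	}
	return nil
}

func main() {
	// e.g. an 8 GiB model with 6 GiB offloaded to VRAM on a host with
	// 1 GiB free RAM and 2 GiB free swap: 2 GiB required, 3 GiB available.
	fmt.Println(checkSystemMemory(8<<30, 6<<30, 1<<30, 2<<30)) // <nil>
}
```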
server/prompt.go

@@ -4,7 +4,6 @@ import (
     "bytes"
     "context"
     "log/slog"
-    "slices"

     "github.com/ollama/ollama/api"
     "github.com/ollama/ollama/llm"
@@ -17,26 +16,18 @@ type tokenizeFunc func(context.Context, string) ([]int, error)

 // chatPrompt truncates any messages that exceed the context window of the model, making sure to always include 1) the
 // latest message and 2) system messages
 func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.Options, msgs []api.Message) (prompt string, images []llm.ImageData, _ error) {
-    // pull out any system messages which should always be included in the prompt
     var system []api.Message
-    msgs = slices.DeleteFunc(msgs, func(m api.Message) bool {
-        if m.Role == "system" {
-            system = append(system, m)
-            return true
-        }
-
-        return false
-    })
-
-    if len(system) == 0 && m.System != "" {
-        // add model system prompt since it wasn't provided
-        system = append(system, api.Message{Role: "system", Content: m.System})
-    }
-
     // always include the last message
     n := len(msgs) - 1
     // in reverse, find all messages that fit into context window
     for i := n - 1; i >= 0; i-- {
+        system = make([]api.Message, 0)
+        for j := range i {
+            if msgs[j].Role == "system" {
+                system = append(system, msgs[j])
+            }
+        }
+
         var b bytes.Buffer
         if err := m.Template.Execute(&b, template.Values{Messages: append(system, msgs[i:]...)}); err != nil {
             return "", nil, err
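The reworked chatPrompt no longer hoists system messages to the front once up-front; instead, for each candidate window start i it re-collects the system messages that precede i, so they keep their relative position in the prompt. A toy sketch of that windowing idea, with a one-character-per-token stand-in for the real template rendering and context check:

```go
package main

import "fmt"

type message struct{ role, content string }

// truncate mirrors the loop above with a toy "renderer": msgs[n:] is
// always kept, and n moves backwards while the rendered window still
// fits; system messages before the window survive, in order.
func truncate(msgs []message, limit int) []message {
	window := func(i int) []message {
		var w []message
		for j := 0; j < i; j++ { // keep preceding system messages, in order
			if msgs[j].role == "system" {
				w = append(w, msgs[j])
			}
		}
		return append(w, msgs[i:]...)
	}
	size := func(w []message) int { // stand-in token count: one per character
		n := 0
		for _, m := range w {
			n += len(m.content) + 1
		}
		return n
	}

	n := len(msgs) - 1 // always include the last message
	for i := n - 1; i >= 0; i-- {
		if size(window(i)) > limit {
			break // including msgs[i] overflows; keep the previous window
		}
		n = i
	}
	return window(n)
}

func main() {
	msgs := []message{
		{"system", "You are terse."},
		{"user", "hello"},
		{"assistant", "hi there, how can I help you today?"},
		{"user", "summarize our chat"},
	}
	// a limit of 40 drops the middle turns but keeps the system message
	fmt.Println(truncate(msgs, 40))
}
```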
server/prompt_test.go

@@ -6,6 +6,7 @@ import (
     "strings"
     "testing"

+    "github.com/google/go-cmp/cmp"
     "github.com/ollama/ollama/api"
     "github.com/ollama/ollama/template"
 )
@@ -164,6 +165,19 @@ func TestChatPrompt(t *testing.T) {
             prompt: "You are the Test Who Lived. You're a test, Harry! I-I'm a what? A test. And a thumping good one at that, I'd wager. ",
         },
     },
+    {
+        name:  "out of order system",
+        limit: 2048,
+        msgs: []api.Message{
+            {Role: "user", Content: "You're a test, Harry!"},
+            {Role: "assistant", Content: "I-I'm a what?"},
+            {Role: "system", Content: "You are the Test Who Lived."},
+            {Role: "user", Content: "A test. And a thumping good one at that, I'd wager."},
+        },
+        expect: expect{
+            prompt: "You're a test, Harry! I-I'm a what? You are the Test Who Lived. A test. And a thumping good one at that, I'd wager. ",
+        },
+    },
 }

 tmpl, err := template.Parse(`
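This new case pins the behavior down: a system message that arrives mid-conversation stays between the surrounding turns in the rendered prompt rather than being moved to the front.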
@@ -187,6 +201,10 @@ func TestChatPrompt(t *testing.T) {
         t.Errorf("expected %q, got %q", tt.prompt, prompt)
     }

+    if diff := cmp.Diff(prompt, tt.prompt); diff != "" {
+        t.Errorf("mismatch (-got +want):\n%s", diff)
+    }
+
     if len(images) != len(tt.images) {
         t.Fatalf("expected %d images, got %d", len(tt.images), len(images))
     }
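go-cmp's cmp.Diff returns an empty string when the two values are equal and a human-readable diff otherwise; with the (got, want) argument order used above, the output reads as -got +want. A minimal usage sketch:

```go
package main

import (
	"fmt"

	"github.com/google/go-cmp/cmp"
)

func main() {
	got := []string{"a", "b"}
	want := []string{"a", "c"}
	// Empty string means equal; otherwise print the diff, read as
	// (-got +want) because of the argument order.
	if diff := cmp.Diff(got, want); diff != "" {
		fmt.Printf("mismatch (-got +want):\n%s", diff)
	}
}
```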
server/routes.go

@@ -102,6 +102,7 @@ func (s *Server) scheduleRunner(ctx context.Context, name string, caps []Capabil
 }

 func (s *Server) GenerateHandler(c *gin.Context) {
+    checkpointStart := time.Now()
     var req api.GenerateRequest
     if err := c.ShouldBindJSON(&req); errors.Is(err, io.EOF) {
         c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "missing request body"})
@@ -129,6 +130,8 @@ func (s *Server) GenerateHandler(c *gin.Context) {
         return
     }

+    checkpointLoaded := time.Now()
+
     if req.Prompt == "" {
         c.JSON(http.StatusOK, api.GenerateResponse{
             Model: req.Model,
@@ -191,26 +194,48 @@ func (s *Server) GenerateHandler(c *gin.Context) {

     ch := make(chan any)
     go func() {
+        // TODO (jmorganca): avoid building the response twice both here and below
+        var sb strings.Builder
         defer close(ch)
         if err := r.Completion(c.Request.Context(), llm.CompletionRequest{
             Prompt:  prompt,
             Images:  images,
             Format:  req.Format,
             Options: opts,
-        }, func(r llm.CompletionResponse) {
-            ch <- api.GenerateResponse{
+        }, func(cr llm.CompletionResponse) {
+            res := api.GenerateResponse{
                 Model:      req.Model,
                 CreatedAt:  time.Now().UTC(),
-                Response:   r.Content,
-                Done:       r.Done,
-                DoneReason: r.DoneReason,
+                Response:   cr.Content,
+                Done:       cr.Done,
+                DoneReason: cr.DoneReason,
                 Metrics: api.Metrics{
-                    PromptEvalCount:    r.PromptEvalCount,
-                    PromptEvalDuration: r.PromptEvalDuration,
-                    EvalCount:          r.EvalCount,
-                    EvalDuration:       r.EvalDuration,
+                    PromptEvalCount:    cr.PromptEvalCount,
+                    PromptEvalDuration: cr.PromptEvalDuration,
+                    EvalCount:          cr.EvalCount,
+                    EvalDuration:       cr.EvalDuration,
                 },
             }
+
+            if _, err := sb.WriteString(cr.Content); err != nil {
+                ch <- gin.H{"error": err.Error()}
+            }
+
+            if cr.Done {
+                res.TotalDuration = time.Since(checkpointStart)
+                res.LoadDuration = checkpointLoaded.Sub(checkpointStart)
+
+                if !req.Raw {
+                    tokens, err := r.Tokenize(c.Request.Context(), prompt+sb.String())
+                    if err != nil {
+                        ch <- gin.H{"error": err.Error()}
+                        return
+                    }
+                    res.Context = append(req.Context, tokens...)
+                }
+            }
+
+            ch <- res
         }); err != nil {
             ch <- gin.H{"error": err.Error()}
         }
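Both handlers now share the same shape: a producer goroutine streams partial responses over a channel, a strings.Builder accumulates the full text, and only the final (Done) response carries the aggregate fields computed from the checkpoints. A self-contained sketch of that pattern, where fakeCompletion is a hypothetical stand-in for the runner's Completion callback API:

```go
package main

import (
	"fmt"
	"strings"
	"time"
)

type chunk struct {
	content string
	done    bool
}

// fakeCompletion invokes the callback once per partial result and a
// final time with done set, loosely imitating a streaming runner.
func fakeCompletion(fn func(chunk)) {
	for _, w := range []string{"hello", " ", "world"} {
		fn(chunk{content: w})
	}
	fn(chunk{done: true})
}

func main() {
	start := time.Now()
	ch := make(chan any)
	go func() {
		defer close(ch)
		var sb strings.Builder // accumulates the full response text
		fakeCompletion(func(c chunk) {
			sb.WriteString(c.content)
			res := map[string]any{"response": c.content, "done": c.done}
			if c.done {
				// only the final chunk carries aggregate data, e.g. the
				// total duration and the accumulated text
				res["total_duration"] = time.Since(start)
				res["full"] = sb.String()
			}
			ch <- res
		})
	}()
	for r := range ch {
		fmt.Println(r) // consumer side: stream each response as it arrives
	}
}
```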
@@ -1122,6 +1147,8 @@ func (s *Server) ProcessHandler(c *gin.Context) {
 }

 func (s *Server) ChatHandler(c *gin.Context) {
+    checkpointStart := time.Now()
+
     var req api.ChatRequest
     if err := c.ShouldBindJSON(&req); errors.Is(err, io.EOF) {
         c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "missing request body"})
@@ -1141,6 +1168,8 @@ func (s *Server) ChatHandler(c *gin.Context) {
         return
     }

+    checkpointLoaded := time.Now()
+
     if len(req.Messages) == 0 {
         c.JSON(http.StatusOK, api.ChatResponse{
             Model: req.Model,
@@ -1169,7 +1198,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
             Format:  req.Format,
             Options: opts,
         }, func(r llm.CompletionResponse) {
-            ch <- api.ChatResponse{
+            res := api.ChatResponse{
                 Model:     req.Model,
                 CreatedAt: time.Now().UTC(),
                 Message:   api.Message{Role: "assistant", Content: r.Content},
@@ -1182,6 +1211,13 @@ func (s *Server) ChatHandler(c *gin.Context) {
                     EvalDuration: r.EvalDuration,
                 },
             }
+
+            if r.Done {
+                res.TotalDuration = time.Since(checkpointStart)
+                res.LoadDuration = checkpointLoaded.Sub(checkpointStart)
+            }
+
+            ch <- res
         }); err != nil {
             ch <- gin.H{"error": err.Error()}
         }
template/template.go

@@ -149,27 +149,19 @@ type Values struct {
 }

 func (t *Template) Execute(w io.Writer, v Values) error {
-    system, collated := collate(v.Messages)
+    system, messages := collate(v.Messages)
     if !v.forceLegacy && slices.Contains(t.Vars(), "messages") {
         return t.Template.Execute(w, map[string]any{
             "System":   system,
-            "Messages": collated,
+            "Messages": messages,
         })
     }

+    system = ""
     var b bytes.Buffer
     var prompt, response string
-    for i, m := range collated {
-        switch m.Role {
-        case "system":
-            system = m.Content
-        case "user":
-            prompt = m.Content
-        case "assistant":
-            response = m.Content
-        }
-
-        if i != len(collated)-1 && prompt != "" && response != "" {
+    for _, m := range messages {
+        execute := func() error {
             if err := t.Template.Execute(&b, map[string]any{
                 "System": system,
                 "Prompt": prompt,
@@ -181,6 +173,26 @@ func (t *Template) Execute(w io.Writer, v Values) error {
             system = ""
             prompt = ""
             response = ""
+            return nil
+        }
+
+        switch m.Role {
+        case "system":
+            if prompt != "" || response != "" {
+                if err := execute(); err != nil {
+                    return err
+                }
+            }
+            system = m.Content
+        case "user":
+            if response != "" {
+                if err := execute(); err != nil {
+                    return err
+                }
+            }
+            prompt = m.Content
+        case "assistant":
+            response = m.Content
         }
     }
@@ -199,7 +211,7 @@ func (t *Template) Execute(w io.Writer, v Values) error {

     tree := parse.Tree{Root: nodes.(*parse.ListNode)}
     if err := template.Must(template.New("").AddParseTree("", &tree)).Execute(&b, map[string]any{
-        "System": "",
+        "System": system,
         "Prompt": prompt,
     }); err != nil {
         return err
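The effect of the template/template.go changes is easiest to see in isolation: the legacy path now defers rendering into an execute closure and flushes whenever a new system or user message would overwrite pending state, so a mid-conversation system message lands in its own render with the right System value. A simplified sketch of that flushing logic, where the bracketed output is a stand-in for executing the real legacy template:

```go
package main

import "fmt"

type msg struct{ role, content string }

// render sketches the legacy loop after this change: state is flushed
// whenever a new system or user message would overwrite unrendered
// content, so out-of-order system messages keep their place.
func render(msgs []msg) string {
	var out, system, prompt, response string
	flush := func() {
		// stand-in for executing the legacy prompt/response template
		out += fmt.Sprintf("[sys=%q prompt=%q resp=%q]", system, prompt, response)
		system, prompt, response = "", "", ""
	}
	for _, m := range msgs {
		switch m.role {
		case "system":
			if prompt != "" || response != "" {
				flush()
			}
			system = m.content
		case "user":
			if response != "" {
				flush()
			}
			prompt = m.content
		case "assistant":
			response = m.content
		}
	}
	flush() // render whatever remains
	return out
}

func main() {
	fmt.Println(render([]msg{
		{"user", "hi"},
		{"assistant", "hello"},
		{"system", "be terse"},
		{"user", "bye"},
	}))
	// [sys="" prompt="hi" resp="hello"][sys="be terse" prompt="bye" resp=""]
}
```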