check with HEAD before upload

handle edge case when a blob has been uploaded earlier
mlxrunner: Fix panic on full KV cache hit
2026-02-28 04:56:37 -05:00 · 2026-02-27 14:29:17 -08:00 · 2026-02-27 14:12:15 -08:00 · 2026-02-27 11:07:03 -08:00 · 2026-02-26 18:38:27 -08:00
9 changed files with 44 additions and 2 deletions
--- a/api/types.go
+++ b/api/types.go
@@ -15,6 +15,7 @@ import (
 	"github.com/google/uuid"

 	"github.com/ollama/ollama/envconfig"
+	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/internal/orderedmap"
 	"github.com/ollama/ollama/types/model"
 )
@@ -569,6 +570,7 @@ type DebugInfo struct {

 type Metrics struct {
 	TotalDuration      time.Duration `json:"total_duration,omitempty"`
+	PeakMemory         uint64        `json:"peak_memory,omitempty"`
 	LoadDuration       time.Duration `json:"load_duration,omitempty"`
 	PromptEvalCount    int           `json:"prompt_eval_count,omitempty"`
 	PromptEvalDuration time.Duration `json:"prompt_eval_duration,omitempty"`
@@ -934,6 +936,10 @@ func (m *Metrics) Summary() {
 		fmt.Fprintf(os.Stderr, "total duration:       %v\n", m.TotalDuration)
 	}

+	if m.PeakMemory > 0 {
+		fmt.Fprintf(os.Stderr, "peak memory:          %s\n", formatPeakMemory(m.PeakMemory))
+	}
+
 	if m.LoadDuration > 0 {
 		fmt.Fprintf(os.Stderr, "load duration:        %v\n", m.LoadDuration)
 	}
@@ -957,6 +963,14 @@ func (m *Metrics) Summary() {
 	}
 }

+func formatPeakMemory(b uint64) string {
+	if b >= format.GibiByte {
+		return fmt.Sprintf("%.3f GiB", float64(b)/float64(format.GibiByte))
+	}
+
+	return format.HumanBytes2(b)
+}
+
 func (opts *Options) FromMap(m map[string]any) error {
 	valueOpts := reflect.ValueOf(opts).Elem() // names of the fields in the options struct
 	typeOpts := reflect.TypeOf(opts).Elem()   // types of the fields in the options struct
--- a/llm/server.go
+++ b/llm/server.go
@@ -1518,6 +1518,7 @@ type CompletionResponse struct {
 	PromptEvalDuration time.Duration `json:"prompt_eval_duration"`
 	EvalCount          int           `json:"eval_count"`
 	EvalDuration       time.Duration `json:"eval_duration"`
+	PeakMemory         uint64        `json:"peak_memory,omitempty"`

 	// Logprobs contains log probability information if requested
 	Logprobs []Logprob `json:"logprobs,omitempty"`
--- a/server/routes.go
+++ b/server/routes.go
@@ -557,6 +557,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 					PromptEvalDuration: cr.PromptEvalDuration,
 					EvalCount:          cr.EvalCount,
 					EvalDuration:       cr.EvalDuration,
+					PeakMemory:         cr.PeakMemory,
 				},
 				Logprobs: toAPILogprobs(cr.Logprobs),
 			}
@@ -2309,6 +2310,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
 						PromptEvalDuration: r.PromptEvalDuration,
 						EvalCount:          r.EvalCount,
 						EvalDuration:       r.EvalDuration,
+						PeakMemory:         r.PeakMemory,
 					},
 					Logprobs: toAPILogprobs(r.Logprobs),
 				}
--- a/x/imagegen/transfer/upload.go
+++ b/x/imagegen/transfer/upload.go
@@ -150,6 +150,14 @@ func (u *uploader) uploadOnce(ctx context.Context, blob Blob) (int64, error) {
 		u.logger.Debug("uploading blob", "digest", blob.Digest, "size", blob.Size)
 	}

+	// Check if blob already exists (may have been uploaded by another
+	// concurrent process since the initial HEAD check)
+	if exists, err := u.exists(ctx, blob); err != nil {
+		return 0, err
+	} else if exists {
+		return 0, nil
+	}
+
 	// Init upload
 	uploadURL, err := u.initUpload(ctx, blob)
 	if err != nil {
--- a/x/mlxrunner/cache.go
+++ b/x/mlxrunner/cache.go
@@ -78,6 +78,12 @@ func (c *kvCache) findRemaining(tokens []int32) []int32 {
 		prefix++
 	}

+	// Always keep at least one token to re-evaluate so the
+	// pipeline can seed token generation from it.
+	if prefix == len(tokens) && prefix > 0 {
+		prefix--
+	}
+
 	if prefix < len(c.tokens) {
 		trim := len(c.tokens) - prefix
 		for _, kv := range c.caches {
--- a/x/mlxrunner/client.go
+++ b/x/mlxrunner/client.go
@@ -268,6 +268,7 @@ func (c *Client) Completion(ctx context.Context, req llm.CompletionRequest, fn f
 			PromptEvalDuration int    `json:"prompt_eval_duration,omitempty"`
 			EvalCount          int    `json:"eval_count,omitempty"`
 			EvalDuration       int    `json:"eval_duration,omitempty"`
+			PeakMemory         uint64 `json:"peak_memory,omitempty"`
 		}
 		if err := json.Unmarshal(scanner.Bytes(), &raw); err != nil {
 			slog.Debug("mlx response parse error", "error", err, "line", string(scanner.Bytes()))
@@ -282,6 +283,7 @@ func (c *Client) Completion(ctx context.Context, req llm.CompletionRequest, fn f
 			PromptEvalDuration: time.Duration(raw.PromptEvalDuration),
 			EvalCount:          raw.EvalCount,
 			EvalDuration:       time.Duration(raw.EvalDuration),
+			PeakMemory:         raw.PeakMemory,
 		}

 		fn(cresp)
--- a/x/mlxrunner/mlx/memory.go
+++ b/x/mlxrunner/mlx/memory.go
@@ -64,6 +64,10 @@ func PeakMemory() int {
 	return int(peak)
 }

+func ResetPeakMemory() {
+	C.mlx_reset_peak_memory()
+}
+
 type Memory struct{}

 func (Memory) LogValue() slog.Value {
--- a/x/mlxrunner/pipeline.go
+++ b/x/mlxrunner/pipeline.go
@@ -44,8 +44,13 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
 	} else {
 		mlx.DisableCompile()
 	}
+	mlx.ResetPeakMemory()

 	inputs := r.Tokenizer.Encode(request.Prompt, true)
+	if len(inputs) == 0 {
+		return errors.New("empty prompt")
+	}
+
 	session := r.cache.begin(r.Model, inputs)
 	defer session.close()

@@ -53,7 +58,6 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
 	tokens := session.remaining

 	total, processed := len(tokens), 0
-	slog.Info("Prompt processing progress", "processed", processed, "total", total)
 	for total-processed > 1 {
 		if err := request.Ctx.Err(); err != nil {
 			return err
@@ -103,7 +107,6 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
 		nextSample, nextLogprobs = step(sample)

 		if i == 0 {
-			slog.Info("Prompt processing progress", "processed", total, "total", total)
 			mlx.Eval(sample)
 			final.PromptTokensDuration = time.Since(now)
 			now = time.Now()
@@ -138,6 +141,7 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
 	}

 	final.CompletionTokensDuration = time.Since(now)
+	final.PeakMemory = uint64(mlx.PeakMemory())
 	select {
 	case <-request.Ctx.Done():
 		return request.Ctx.Err()
--- a/x/mlxrunner/runner.go
+++ b/x/mlxrunner/runner.go
@@ -54,6 +54,7 @@ type Response struct {
 	PromptTokensDuration     time.Duration `json:"prompt_eval_duration,omitempty"`
 	CompletionTokens         int           `json:"eval_count,omitempty"`
 	CompletionTokensDuration time.Duration `json:"eval_duration,omitempty"`
+	PeakMemory               uint64        `json:"peak_memory,omitempty"`
 	TotalTokens              int           `json:"total_tokens,omitempty"`
 }
Author	SHA1	Message	Date
Dong Chen	241ac6e003	check with HEAD before upload	2026-02-27 14:29:17 -08:00
Dong Chen	17dc6918cf	handle edge case when a blob has been uploaded earlier	2026-02-27 14:12:15 -08:00
Jesse Gross	dd5eb6337d	mlxrunner: Fix panic on full KV cache hit. When the entire prompt was already cached (e.g. a repeated prompt), findRemaining returned an empty slice, causing FromValues to panic with an index-out-of-range error when accessing a zero-length byte slice. Fix by always keeping at least one token to re-evaluate so the pipeline can seed token generation. Also reject empty prompts early rather than panicking.	2026-02-27 11:07:03 -08:00
Patrick Devine	79917cf80b	show peak memory usage (#14485)	2026-02-26 18:38:27 -08:00