add MaxContextLength()

address comments
cleanup
2026-03-01 05:26:49 -05:00 · 2026-02-28 16:59:58 -08:00 · 2026-02-28 16:56:54 -08:00 · 2026-02-28 16:56:54 -08:00 · 2026-02-28 16:56:54 -08:00 · 2026-02-28 16:56:54 -08:00
17 changed files with 240 additions and 981 deletions
--- a/llm/server.go
+++ b/llm/server.go
@@ -1453,12 +1453,10 @@ type ImageData struct {
 }

 type CompletionRequest struct {
-	Prompt          string
-	Format          json.RawMessage
-	Images          []ImageData
-	Options         *api.Options
-	Think           *api.ThinkValue
-	ExplicitOptions map[string]struct{}
+	Prompt  string
+	Format  json.RawMessage
+	Images  []ImageData
+	Options *api.Options

 	Grammar  string // set before sending the request to the subprocess
 	Shift    bool
--- a/server/routes.go
+++ b/server/routes.go
@@ -130,35 +130,6 @@ func (s *Server) modelOptions(model *Model, requestOpts map[string]any) (api.Opt
 	return opts, nil
 }

-func explicitOptions(modelOpts, requestOpts map[string]any) map[string]struct{} {
-	keys := []string{
-		"temperature",
-		"top_p",
-		"min_p",
-		"top_k",
-		"repeat_last_n",
-		"repeat_penalty",
-		"presence_penalty",
-		"frequency_penalty",
-	}
-
-	explicit := make(map[string]struct{}, len(keys))
-	for _, key := range keys {
-		if optionSpecified(modelOpts, requestOpts, key) {
-			explicit[key] = struct{}{}
-		}
-	}
-	return explicit
-}
-
-func optionSpecified(modelOpts, requestOpts map[string]any, key string) bool {
-	if _, ok := requestOpts[key]; ok {
-		return true
-	}
-	_, ok := modelOpts[key]
-	return ok
-}
-
 // scheduleRunner schedules a runner after validating inputs such as capabilities and model options.
 // It returns the allocated runner, model instance, and consolidated options if successful and error otherwise.
 func (s *Server) scheduleRunner(ctx context.Context, name string, caps []model.Capability, requestOpts map[string]any, keepAlive *api.Duration) (llm.LlamaServer, *Model, *api.Options, error) {
@@ -568,16 +539,14 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 		var sb strings.Builder
 		defer close(ch)
 		if err := r.Completion(c.Request.Context(), llm.CompletionRequest{
-			Prompt:          prompt,
-			Images:          images,
-			Format:          req.Format,
-			Options:         opts,
-			Think:           req.Think,
-			ExplicitOptions: explicitOptions(m.Options, req.Options),
-			Shift:           req.Shift == nil || *req.Shift,
-			Truncate:        req.Truncate == nil || *req.Truncate,
-			Logprobs:        req.Logprobs,
-			TopLogprobs:     req.TopLogprobs,
+			Prompt:      prompt,
+			Images:      images,
+			Format:      req.Format,
+			Options:     opts,
+			Shift:       req.Shift == nil || *req.Shift,
+			Truncate:    req.Truncate == nil || *req.Truncate,
+			Logprobs:    req.Logprobs,
+			TopLogprobs: req.TopLogprobs,
 		}, func(cr llm.CompletionResponse) {
 			res := api.GenerateResponse{
 				Model:     req.Model,
@@ -2329,16 +2298,14 @@ func (s *Server) ChatHandler(c *gin.Context) {
 			// sets up new context given parent context per request
 			ctx, cancel := context.WithCancel(c.Request.Context())
 			err := r.Completion(ctx, llm.CompletionRequest{
-				Prompt:          prompt,
-				Images:          images,
-				Format:          currentFormat,
-				Options:         opts,
-				Think:           req.Think,
-				ExplicitOptions: explicitOptions(m.Options, req.Options),
-				Shift:           req.Shift == nil || *req.Shift,
-				Truncate:        truncate,
-				Logprobs:        req.Logprobs,
-				TopLogprobs:     req.TopLogprobs,
+				Prompt:      prompt,
+				Images:      images,
+				Format:      currentFormat,
+				Options:     opts,
+				Shift:       req.Shift == nil || *req.Shift,
+				Truncate:    truncate,
+				Logprobs:    req.Logprobs,
+				TopLogprobs: req.TopLogprobs,
 			}, func(r llm.CompletionResponse) {
 				res := api.ChatResponse{
 					Model:     req.Model,
--- a/x/mlxrunner/cache.go
+++ b/x/mlxrunner/cache.go
@@ -109,6 +109,8 @@ func (s *cacheSession) close() {
 		if kv == nil {
 			continue
 		}
+		// Mixed cache types (e.g. recurrent + KV) can transiently report different
+		// offsets, so use the minimum as the safe reusable token prefix.
 		if off := kv.Offset(); offset < 0 || off < offset {
 			offset = off
 		}
--- a/x/mlxrunner/cache/recurrent.go
+++ b/x/mlxrunner/cache/recurrent.go
@@ -20,21 +20,19 @@ type RecurrentCache struct {
 	headKDim  int
 }

-func (c *RecurrentCache) setStateMaterialized(dst **mlx.Array, v *mlx.Array) {
+func (c *RecurrentCache) setStateMaterialized(old, v *mlx.Array) *mlx.Array {
 	if v == nil || !v.Valid() {
-		return
+		return old
 	}
-	if *dst == v {
-		return
+	if old == v {
+		return old
 	}

 	// Break dependency chains so recurrent state does not retain the full
 	// per-token compute graph over time.
-	snap := mlx.Snapshot(v)
+	snap := mlx.Copy(v)
 	mlx.Eval(snap)

-	old := *dst
-	*dst = snap
 	mlx.Pin(snap)

 	// Drop references to the previous cached state root and transient incoming
@@ -46,40 +44,40 @@ func (c *RecurrentCache) setStateMaterialized(dst **mlx.Array, v *mlx.Array) {
 	if v != snap && v != old {
 		mlx.Unpin(v)
 	}
+
+	return snap
 }

-func (c *RecurrentCache) setStateRaw(dst **mlx.Array, v *mlx.Array) {
+func (c *RecurrentCache) setStateRaw(old, v *mlx.Array) *mlx.Array {
 	if v == nil || !v.Valid() {
-		return
+		return old
 	}
-	if *dst == v {
-		return
+	if old == v {
+		return old
 	}

-	old := *dst
-	*dst = v
 	mlx.Pin(v)
 	if old != nil && old != v {
 		mlx.Unpin(old)
 	}
+
+	return v
 }

-func (c *RecurrentCache) setStateDetached(dst **mlx.Array, v *mlx.Array, ensureContiguous bool) {
+func (c *RecurrentCache) setStateDetached(old, v *mlx.Array, ensureContiguous bool) *mlx.Array {
 	if v == nil || !v.Valid() {
-		return
+		return old
 	}
-	if *dst == v {
-		return
+	if old == v {
+		return old
 	}

 	root := v
 	if ensureContiguous {
 		root = mlx.Contiguous(v, false)
 	}
-	detached := mlx.Detach(root)
+	detached := root.Clone()

-	old := *dst
-	*dst = detached
 	mlx.Pin(detached)
 	if old != nil && old != detached {
 		mlx.Unpin(old)
@@ -88,13 +86,15 @@ func (c *RecurrentCache) setStateDetached(dst **mlx.Array, v *mlx.Array, ensureC
 	// Intentionally do not force-release root/v here. In the fast path, the detached
 	// handle aliases the same MLX value and may still be lazily computed. Releasing the
 	// source handles can invalidate the cached state before the next eval/sweep point.
+
+	return detached
 }

 func snapshotPinned(a *mlx.Array) *mlx.Array {
 	if a == nil || !a.Valid() {
 		return nil
 	}
-	snap := mlx.Snapshot(a)
+	snap := mlx.Copy(a)
 	mlx.Eval(snap)
 	mlx.Pin(snap)
 	return snap
@@ -124,10 +124,10 @@ func (c *RecurrentCache) ensure(batch int, dtype mlx.DType) {
 	}

 	if needConv {
-		c.setStateRaw(&c.convState, mlx.Zeros(dtype, batch, c.convTail, c.convDim))
+		c.convState = c.setStateRaw(c.convState, mlx.Zeros(dtype, batch, c.convTail, c.convDim))
 	}
 	if needDelta {
-		c.setStateRaw(&c.deltaState, mlx.Zeros(dtype, batch, c.numVHeads, c.headVDim, c.headKDim))
+		c.deltaState = c.setStateRaw(c.deltaState, mlx.Zeros(dtype, batch, c.numVHeads, c.headVDim, c.headKDim))
 	}
 }

@@ -137,7 +137,7 @@ func (c *RecurrentCache) ConvState(batch int, dtype mlx.DType) *mlx.Array {
 }

 func (c *RecurrentCache) SetConvState(v *mlx.Array) {
-	c.setStateMaterialized(&c.convState, v)
+	c.convState = c.setStateMaterialized(c.convState, v)
 }

 // SetConvStateFast stores conv state without forcing an immediate snapshot/eval.
@@ -145,7 +145,7 @@ func (c *RecurrentCache) SetConvState(v *mlx.Array) {
 // sync/sweep point. The conv-state input is usually a slice view, so request a
 // compact contiguous copy to avoid pinning the whole source buffer.
 func (c *RecurrentCache) SetConvStateFast(v *mlx.Array) {
-	c.setStateDetached(&c.convState, v, true)
+	c.convState = c.setStateDetached(c.convState, v, true)
 }

 func (c *RecurrentCache) DeltaState(batch int, dtype mlx.DType) *mlx.Array {
@@ -154,14 +154,14 @@ func (c *RecurrentCache) DeltaState(batch int, dtype mlx.DType) *mlx.Array {
 }

 func (c *RecurrentCache) SetDeltaState(v *mlx.Array) {
-	c.setStateMaterialized(&c.deltaState, v)
+	c.deltaState = c.setStateMaterialized(c.deltaState, v)
 }

 // SetDeltaStateFast stores delta state without forcing an immediate snapshot/eval.
 // Use only for decode hot paths that accept higher transient memory until the next
 // sync/sweep point.
 func (c *RecurrentCache) SetDeltaStateFast(v *mlx.Array) {
-	c.setStateDetached(&c.deltaState, v, false)
+	c.deltaState = c.setStateDetached(c.deltaState, v, false)
 }

 func (c *RecurrentCache) Advance(n int) {
--- a/x/mlxrunner/client.go
+++ b/x/mlxrunner/client.go
@@ -182,20 +182,15 @@ func (c *Client) waitUntilRunning() error {
 // completionRequest is a properly-tagged version of llm.CompletionRequest for JSON serialization.
 type completionRequest struct {
 	Prompt  string          `json:"prompt"`
-	Think   *bool           `json:"think,omitempty"`
 	Options *completionOpts `json:"options,omitempty"`
 }

 type completionOpts struct {
-	Temperature      *float32 `json:"temperature,omitempty"`
-	TopP             *float32 `json:"top_p,omitempty"`
-	MinP             *float32 `json:"min_p,omitempty"`
-	TopK             *int     `json:"top_k,omitempty"`
-	RepeatLastN      *int     `json:"repeat_last_n,omitempty"`
-	RepeatPenalty    *float32 `json:"repeat_penalty,omitempty"`
-	PresencePenalty  *float32 `json:"presence_penalty,omitempty"`
-	FrequencyPenalty *float32 `json:"frequency_penalty,omitempty"`
-	NumPredict       int      `json:"num_predict,omitempty"`
+	Temperature float32 `json:"temperature,omitempty"`
+	TopP        float32 `json:"top_p,omitempty"`
+	MinP        float32 `json:"min_p,omitempty"`
+	TopK        int     `json:"top_k,omitempty"`
+	NumPredict  int     `json:"num_predict,omitempty"`
 }

 type CompletionResponse struct {
@@ -233,27 +228,16 @@ func (c *Client) Close() error {

 // Completion implements llm.LlamaServer.
 func (c *Client) Completion(ctx context.Context, req llm.CompletionRequest, fn func(llm.CompletionResponse)) error {
-	var think *bool
-	if req.Think != nil {
-		enabled := req.Think.Bool()
-		think = &enabled
-	}
-
 	creq := completionRequest{
 		Prompt: req.Prompt,
-		Think:  think,
 	}
 	if req.Options != nil {
 		creq.Options = &completionOpts{
-			Temperature:      float32Ptr(req.Options.Temperature, hasExplicitOption(req.ExplicitOptions, "temperature")),
-			TopP:             float32Ptr(req.Options.TopP, hasExplicitOption(req.ExplicitOptions, "top_p")),
-			MinP:             float32Ptr(req.Options.MinP, hasExplicitOption(req.ExplicitOptions, "min_p")),
-			TopK:             intPtr(req.Options.TopK, hasExplicitOption(req.ExplicitOptions, "top_k")),
-			RepeatLastN:      intPtr(req.Options.RepeatLastN, hasExplicitOption(req.ExplicitOptions, "repeat_last_n")),
-			RepeatPenalty:    float32Ptr(req.Options.RepeatPenalty, hasExplicitOption(req.ExplicitOptions, "repeat_penalty")),
-			PresencePenalty:  float32Ptr(req.Options.PresencePenalty, hasExplicitOption(req.ExplicitOptions, "presence_penalty")),
-			FrequencyPenalty: float32Ptr(req.Options.FrequencyPenalty, hasExplicitOption(req.ExplicitOptions, "frequency_penalty")),
-			NumPredict:       req.Options.NumPredict,
+			Temperature: req.Options.Temperature,
+			TopP:        req.Options.TopP,
+			MinP:        req.Options.MinP,
+			TopK:        req.Options.TopK,
+			NumPredict:  req.Options.NumPredict,
 		}
 	}

@@ -312,25 +296,6 @@ func (c *Client) Completion(ctx context.Context, req llm.CompletionRequest, fn f
 	return scanner.Err()
 }

-func hasExplicitOption(explicit map[string]struct{}, key string) bool {
-	_, ok := explicit[key]
-	return ok
-}
-
-func float32Ptr(v float32, ok bool) *float32 {
-	if !ok {
-		return nil
-	}
-	return &v
-}
-
-func intPtr(v int, ok bool) *int {
-	if !ok {
-		return nil
-	}
-	return &v
-}
-
 func (c *Client) ContextLength() int {
 	return int(c.contextLength.Load())
 }
--- a/x/mlxrunner/client_test.go
+++ b/x/mlxrunner/client_test.go
@@ -1,167 +0,0 @@
-package mlxrunner
-
-import (
-	"context"
-	"encoding/json"
-	"io"
-	"net/http"
-	"strings"
-	"testing"
-
-	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/llm"
-)
-
-func TestCompletionForwardsThink(t *testing.T) {
-	boolPtr := func(v bool) *bool { return &v }
-
-	testCases := []struct {
-		name  string
-		think *api.ThinkValue
-		want  *bool
-	}{
-		{name: "unset", think: nil, want: nil},
-		{name: "enabled", think: &api.ThinkValue{Value: true}, want: boolPtr(true)},
-		{name: "disabled", think: &api.ThinkValue{Value: false}, want: boolPtr(false)},
-		{name: "level maps to enabled", think: &api.ThinkValue{Value: "high"}, want: boolPtr(true)},
-	}
-
-	for _, tc := range testCases {
-		t.Run(tc.name, func(t *testing.T) {
-			var got completionRequest
-
-			rt := roundTripFunc(func(r *http.Request) (*http.Response, error) {
-				if r.URL.Path != "/completion" {
-					t.Fatalf("request path = %q, want %q", r.URL.Path, "/completion")
-				}
-
-				if err := json.NewDecoder(r.Body).Decode(&got); err != nil {
-					return nil, err
-				}
-
-				return &http.Response{
-					StatusCode: http.StatusOK,
-					Header:     make(http.Header),
-					Body:       io.NopCloser(strings.NewReader("{\"done\":true}\n")),
-					Request:    r,
-				}, nil
-			})
-
-			c := &Client{
-				port: 11434,
-				client: &http.Client{
-					Transport: rt,
-				},
-			}
-
-			err := c.Completion(context.Background(), llm.CompletionRequest{
-				Prompt: "hello",
-				Think:  tc.think,
-			}, func(llm.CompletionResponse) {})
-			if err != nil {
-				t.Fatalf("completion request failed: %v", err)
-			}
-
-			if got.Prompt != "hello" {
-				t.Fatalf("prompt = %q, want %q", got.Prompt, "hello")
-			}
-
-			switch {
-			case tc.want == nil && got.Think != nil:
-				t.Fatalf("think = %v, want nil", *got.Think)
-			case tc.want != nil && got.Think == nil:
-				t.Fatalf("think = nil, want %v", *tc.want)
-			case tc.want != nil && got.Think != nil && *tc.want != *got.Think:
-				t.Fatalf("think = %v, want %v", *got.Think, *tc.want)
-			}
-		})
-	}
-}
-
-func TestCompletionForwardsOnlySpecifiedSamplingOptions(t *testing.T) {
-	var got completionRequest
-
-	rt := roundTripFunc(func(r *http.Request) (*http.Response, error) {
-		if err := json.NewDecoder(r.Body).Decode(&got); err != nil {
-			return nil, err
-		}
-
-		return &http.Response{
-			StatusCode: http.StatusOK,
-			Header:     make(http.Header),
-			Body:       io.NopCloser(strings.NewReader("{\"done\":true}\n")),
-			Request:    r,
-		}, nil
-	})
-
-	c := &Client{
-		port: 11434,
-		client: &http.Client{
-			Transport: rt,
-		},
-	}
-
-	opts := &api.Options{
-		Temperature:      1.0,
-		TopP:             0.95,
-		MinP:             0.1,
-		TopK:             20,
-		RepeatLastN:      128,
-		RepeatPenalty:    1.2,
-		PresencePenalty:  1.5,
-		FrequencyPenalty: 0.25,
-		NumPredict:       64,
-	}
-
-	err := c.Completion(context.Background(), llm.CompletionRequest{
-		Prompt:  "hello",
-		Options: opts,
-		ExplicitOptions: map[string]struct{}{
-			"temperature":      {},
-			"top_k":            {},
-			"repeat_penalty":   {},
-			"presence_penalty": {},
-		},
-	}, func(llm.CompletionResponse) {})
-	if err != nil {
-		t.Fatalf("completion request failed: %v", err)
-	}
-
-	if got.Options == nil {
-		t.Fatal("options = nil, want serialized options")
-	}
-
-	if got.Options.Temperature == nil || *got.Options.Temperature != opts.Temperature {
-		t.Fatalf("temperature = %v, want %v", got.Options.Temperature, opts.Temperature)
-	}
-	if got.Options.TopK == nil || *got.Options.TopK != opts.TopK {
-		t.Fatalf("top_k = %v, want %v", got.Options.TopK, opts.TopK)
-	}
-	if got.Options.RepeatPenalty == nil || *got.Options.RepeatPenalty != opts.RepeatPenalty {
-		t.Fatalf("repeat_penalty = %v, want %v", got.Options.RepeatPenalty, opts.RepeatPenalty)
-	}
-	if got.Options.PresencePenalty == nil || *got.Options.PresencePenalty != opts.PresencePenalty {
-		t.Fatalf("presence_penalty = %v, want %v", got.Options.PresencePenalty, opts.PresencePenalty)
-	}
-	if got.Options.TopP != nil {
-		t.Fatalf("top_p = %v, want nil", *got.Options.TopP)
-	}
-	if got.Options.MinP != nil {
-		t.Fatalf("min_p = %v, want nil", *got.Options.MinP)
-	}
-	if got.Options.RepeatLastN != nil {
-		t.Fatalf("repeat_last_n = %v, want nil", *got.Options.RepeatLastN)
-	}
-	if got.Options.FrequencyPenalty != nil {
-		t.Fatalf("frequency_penalty = %v, want nil", *got.Options.FrequencyPenalty)
-	}
-	if got.Options.NumPredict != opts.NumPredict {
-		t.Fatalf("num_predict = %d, want %d", got.Options.NumPredict, opts.NumPredict)
-	}
-}
-
-type roundTripFunc func(*http.Request) (*http.Response, error)
-
-func (f roundTripFunc) RoundTrip(r *http.Request) (*http.Response, error) {
-	return f(r)
-}
--- a/x/mlxrunner/mlx/gated_delta_metal.go
+++ b/x/mlxrunner/mlx/gated_delta_metal.go
@@ -8,14 +8,13 @@ import "C"

 import (
 	"sync"
-	"sync/atomic"
 	"unsafe"
 )

 var (
 	gatedDeltaMetalKernelOnce sync.Once
 	gatedDeltaMetalKernel     C.mlx_fast_metal_kernel
-	gatedDeltaMetalDisabled   atomic.Bool
+	gatedDeltaMetalDisabled   bool
 )

 const gatedDeltaMetalKernelSource = `
@@ -108,7 +107,7 @@ func cStringVector(values []string) (C.mlx_vector_string, func(), bool) {
 func initGatedDeltaMetalKernel() {
 	inputs, freeInputs, ok := cStringVector([]string{"q", "k", "v", "g", "beta", "state_in", "T"})
 	if !ok {
-		gatedDeltaMetalDisabled.Store(true)
+		gatedDeltaMetalDisabled = true
 		freeInputs()
 		return
 	}
@@ -116,7 +115,7 @@ func initGatedDeltaMetalKernel() {

 	outputs, freeOutputs, ok := cStringVector([]string{"y", "state_out"})
 	if !ok {
-		gatedDeltaMetalDisabled.Store(true)
+		gatedDeltaMetalDisabled = true
 		freeOutputs()
 		return
 	}
@@ -143,7 +142,7 @@ func initGatedDeltaMetalKernel() {
 // GatedDeltaKernel runs a fused Metal kernel for the qwen3.5 recurrent update.
 // It returns ok=false on unsupported shapes/devices or kernel setup/apply failure.
 func GatedDeltaKernel(q, k, v, g, beta, state *Array) (y, nextState *Array, ok bool) {
-	if gatedDeltaMetalDisabled.Load() {
+	if gatedDeltaMetalDisabled {
 		return nil, nil, false
 	}
 	if q == nil || k == nil || v == nil || g == nil || beta == nil || state == nil {
@@ -190,7 +189,7 @@ func GatedDeltaKernel(q, k, v, g, beta, state *Array) (y, nextState *Array, ok b
 	}

 	gatedDeltaMetalKernelOnce.Do(initGatedDeltaMetalKernel)
-	if gatedDeltaMetalDisabled.Load() {
+	if gatedDeltaMetalDisabled {
 		return nil, nil, false
 	}

@@ -200,7 +199,7 @@ func GatedDeltaKernel(q, k, v, g, beta, state *Array) (y, nextState *Array, ok b
 	cInT := C.CString("InT")
 	defer C.free(unsafe.Pointer(cInT))
 	if C.mlx_fast_metal_kernel_config_add_template_arg_dtype(cfg, cInT, C.mlx_dtype(dtype)) != 0 {
-		gatedDeltaMetalDisabled.Store(true)
+		gatedDeltaMetalDisabled = true
 		return nil, nil, false
 	}
 	for _, tpl := range []struct {
@@ -216,7 +215,7 @@ func GatedDeltaKernel(q, k, v, g, beta, state *Array) (y, nextState *Array, ok b
 		rc := C.mlx_fast_metal_kernel_config_add_template_arg_int(cfg, cn, C.int(tpl.value))
 		C.free(unsafe.Pointer(cn))
 		if rc != 0 {
-			gatedDeltaMetalDisabled.Store(true)
+			gatedDeltaMetalDisabled = true
 			return nil, nil, false
 		}
 	}
@@ -224,15 +223,15 @@ func GatedDeltaKernel(q, k, v, g, beta, state *Array) (y, nextState *Array, ok b
 	yShape := []C.int{C.int(B), C.int(T), C.int(Hv), C.int(Dv)}
 	stateShape := []C.int{C.int(B), C.int(Hv), C.int(Dv), C.int(Dk)}
 	if C.mlx_fast_metal_kernel_config_add_output_arg(cfg, unsafe.SliceData(yShape), C.size_t(len(yShape)), C.mlx_dtype(dtype)) != 0 {
-		gatedDeltaMetalDisabled.Store(true)
+		gatedDeltaMetalDisabled = true
 		return nil, nil, false
 	}
 	if C.mlx_fast_metal_kernel_config_add_output_arg(cfg, unsafe.SliceData(stateShape), C.size_t(len(stateShape)), C.mlx_dtype(dtype)) != 0 {
-		gatedDeltaMetalDisabled.Store(true)
+		gatedDeltaMetalDisabled = true
 		return nil, nil, false
 	}
 	if C.mlx_fast_metal_kernel_config_set_grid(cfg, 32, C.int(Dv), C.int(B*Hv)) != 0 {
-		gatedDeltaMetalDisabled.Store(true)
+		gatedDeltaMetalDisabled = true
 		return nil, nil, false
 	}
 	threadY := Dv
@@ -240,7 +239,7 @@ func GatedDeltaKernel(q, k, v, g, beta, state *Array) (y, nextState *Array, ok b
 		threadY = 4
 	}
 	if C.mlx_fast_metal_kernel_config_set_thread_group(cfg, 32, C.int(threadY), 1) != 0 {
-		gatedDeltaMetalDisabled.Store(true)
+		gatedDeltaMetalDisabled = true
 		return nil, nil, false
 	}

@@ -260,7 +259,7 @@ func GatedDeltaKernel(q, k, v, g, beta, state *Array) (y, nextState *Array, ok b
 	outVec := C.mlx_vector_array_new()
 	defer C.mlx_vector_array_free(outVec)
 	if C.mlx_fast_metal_kernel_apply(&outVec, gatedDeltaMetalKernel, inVec, cfg, DefaultStream().ctx) != 0 {
-		gatedDeltaMetalDisabled.Store(true)
+		gatedDeltaMetalDisabled = true
 		return nil, nil, false
 	}
 	if int(C.mlx_vector_array_size(outVec)) < 2 {
@@ -273,3 +272,101 @@ func GatedDeltaKernel(q, k, v, g, beta, state *Array) (y, nextState *Array, ok b
 	C.mlx_vector_array_get(&nextState.ctx, outVec, 1)
 	return y, nextState, true
 }
+
+func repeatHeadsForGatedDelta(x *Array, repeatFactor int) *Array {
+	if repeatFactor <= 1 {
+		return x
+	}
+	shape := x.Dims()
+	x = ExpandDims(x, 3)
+	x = Tile(x, []int32{1, 1, 1, int32(repeatFactor), 1})
+	return Reshape(x, int32(shape[0]), int32(shape[1]), int32(shape[2]*repeatFactor), int32(shape[3]))
+}
+
+func gatedDeltaFallback(q, k, v, g, beta, state *Array) (y, nextState *Array) {
+	if q == nil || k == nil || v == nil || g == nil || beta == nil || state == nil {
+		return nil, nil
+	}
+	if !q.Valid() || !k.Valid() || !v.Valid() || !g.Valid() || !beta.Valid() || !state.Valid() {
+		return nil, nil
+	}
+
+	qd := q.Dims()
+	kd := k.Dims()
+	vd := v.Dims()
+	gd := g.Dims()
+	bd := beta.Dims()
+	sd := state.Dims()
+	if len(qd) != 4 || len(kd) != 4 || len(vd) != 4 || len(gd) != 3 || len(bd) != 3 || len(sd) != 4 {
+		return nil, nil
+	}
+
+	B, T, Hk, Dk := int32(qd[0]), int32(qd[1]), int32(qd[2]), int32(qd[3])
+	Hv, Dv := int32(vd[2]), int32(vd[3])
+	if T <= 0 || Hk <= 0 || Dk <= 0 || Hv <= 0 || Dv <= 0 || Hv%Hk != 0 {
+		return nil, nil
+	}
+	if kd[0] != int(B) || kd[1] != int(T) || kd[2] != int(Hk) || kd[3] != int(Dk) {
+		return nil, nil
+	}
+	if vd[0] != int(B) || vd[1] != int(T) {
+		return nil, nil
+	}
+	if gd[0] != int(B) || gd[1] != int(T) || gd[2] != int(Hv) {
+		return nil, nil
+	}
+	if bd[0] != int(B) || bd[1] != int(T) || bd[2] != int(Hv) {
+		return nil, nil
+	}
+	if sd[0] != int(B) || sd[1] != int(Hv) || sd[2] != int(Dv) || sd[3] != int(Dk) {
+		return nil, nil
+	}
+
+	repeatFactor := int(Hv / Hk)
+	q = repeatHeadsForGatedDelta(q, repeatFactor)
+	k = repeatHeadsForGatedDelta(k, repeatFactor)
+
+	nextState = state
+	if T == 1 {
+		qt := Squeeze(q, 1)
+		kt := Squeeze(k, 1)
+		vt := Squeeze(v, 1)
+		gt := Squeeze(g, 1)
+		bt := Squeeze(beta, 1)
+
+		nextState = Mul(nextState, ExpandDims(ExpandDims(gt, -1), -1))
+		kvMem := Sum(Mul(nextState, ExpandDims(kt, 2)), -1, false)
+		delta := Mul(Sub(vt, kvMem), ExpandDims(bt, -1))
+		nextState = Add(nextState, Mul(ExpandDims(kt, 2), ExpandDims(delta, -1)))
+		yt := Sum(Mul(nextState, ExpandDims(qt, 2)), -1, false)
+		return ExpandDims(yt, 1), nextState
+	}
+
+	outs := make([]*Array, 0, T)
+	for t := int32(0); t < T; t++ {
+		qt := Squeeze(SliceStartStop(q, []int32{0, t, 0, 0}, []int32{B, t + 1, Hv, Dk}), 1)
+		kt := Squeeze(SliceStartStop(k, []int32{0, t, 0, 0}, []int32{B, t + 1, Hv, Dk}), 1)
+		vt := Squeeze(SliceStartStop(v, []int32{0, t, 0, 0}, []int32{B, t + 1, Hv, Dv}), 1)
+		gt := Squeeze(SliceStartStop(g, []int32{0, t, 0}, []int32{B, t + 1, Hv}), 1)
+		bt := Squeeze(SliceStartStop(beta, []int32{0, t, 0}, []int32{B, t + 1, Hv}), 1)
+
+		nextState = Mul(nextState, ExpandDims(ExpandDims(gt, -1), -1))
+		kvMem := Sum(Mul(nextState, ExpandDims(kt, 2)), -1, false)
+		delta := Mul(Sub(vt, kvMem), ExpandDims(bt, -1))
+		nextState = Add(nextState, Mul(ExpandDims(kt, 2), ExpandDims(delta, -1)))
+		yt := Sum(Mul(nextState, ExpandDims(qt, 2)), -1, false)
+		outs = append(outs, ExpandDims(yt, 1))
+	}
+	return Concatenate(outs, 1), nextState
+}
+
+// GatedDelta runs the recurrent update operation.
+//
+// It uses the fused Metal kernel when available, else a backend-agnostic MLX fallback with
+// identical inputs/outputs; both results are nil if the fallback rejects the inputs.
+func GatedDelta(q, k, v, g, beta, state *Array) (y, nextState *Array) {
+	if y, nextState, ok := GatedDeltaKernel(q, k, v, g, beta, state); ok {
+		return y, nextState
+	}
+	return gatedDeltaFallback(q, k, v, g, beta, state)
+}
--- a/x/mlxrunner/mlx/ops.go
+++ b/x/mlxrunner/mlx/ops.go
@@ -93,12 +93,6 @@ func (t *Array) Divide(other *Array) *Array {
 	return out
 }

-func (t *Array) Cumsum(axis int, reverse, inclusive bool) *Array {
-	out := New("CUMSUM")
-	C.mlx_cumsum(&out.ctx, t.ctx, C.int(axis), C.bool(reverse), C.bool(inclusive), DefaultStream().ctx)
-	return out
-}
-
 func (t *Array) ExpandDims(axis int) *Array {
 	out := New("EXPAND_DIMS")
 	C.mlx_expand_dims(&out.ctx, t.ctx, C.int(axis), DefaultStream().ctx)
@@ -129,30 +123,12 @@ func (t *Array) GatherMM(other, lhs, rhs *Array, sorted bool) *Array {
 	return out
 }

-func (t *Array) GreaterEqual(other *Array) *Array {
-	out := New("GREATER_EQUAL")
-	C.mlx_greater_equal(&out.ctx, t.ctx, other.ctx, DefaultStream().ctx)
-	return out
-}
-
 func (t *Array) Logsumexp(keepDims bool) *Array {
 	out := New("LOGSUMEXP")
 	C.mlx_logsumexp(&out.ctx, t.ctx, C.bool(keepDims), DefaultStream().ctx)
 	return out
 }

-func (t *Array) Less(other *Array) *Array {
-	out := New("LESS")
-	C.mlx_less(&out.ctx, t.ctx, other.ctx, DefaultStream().ctx)
-	return out
-}
-
-func (t *Array) LogicalOr(other *Array) *Array {
-	out := New("LOGICAL_OR")
-	C.mlx_logical_or(&out.ctx, t.ctx, other.ctx, DefaultStream().ctx)
-	return out
-}
-
 func (t *Array) Matmul(other *Array) *Array {
 	out := New("MATMUL")
 	C.mlx_matmul(&out.ctx, t.ctx, other.ctx, DefaultStream().ctx)
--- a/x/mlxrunner/mlx/ops_extra.go
+++ b/x/mlxrunner/mlx/ops_extra.go
@@ -429,27 +429,16 @@ func Collect(v any) []*Array {
 	return arrays
 }

-// Snapshot copies an array into a fresh leaf value with no Go-side graph inputs.
-func Snapshot(a *Array) *Array {
+// Copy copies an array into a fresh leaf value with no Go-side graph inputs; a nil or invalid array is returned unchanged.
+func Copy(a *Array) *Array {
 	if a == nil || !a.Valid() {
 		return a
 	}
-	out := New("SNAPSHOT")
+	out := New("COPY")
 	C.mlx_copy(&out.ctx, a.ctx, DefaultStream().ctx)
 	return out
 }

-// Detach returns a new Array handle that shares the same MLX value but does
-// not retain Go-side graph input references.
-func Detach(a *Array) *Array {
-	if a == nil || !a.Valid() {
-		return a
-	}
-	out := New("DETACH")
-	C.mlx_array_set(&out.ctx, a.ctx)
-	return out
-}
-
 func collect(v reflect.Value, arrays *[]*Array, seen map[uintptr]bool) {
 	if !v.IsValid() {
 		return
--- a/x/mlxrunner/pipeline.go
+++ b/x/mlxrunner/pipeline.go
@@ -26,10 +26,6 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
 	}

 	ctx := request.Ctx
-	if ctx == nil {
-		ctx = context.Background()
-	}
-
 	var (
 		sample, logprobs         *mlx.Array
 		nextSample, nextLogprobs *mlx.Array
@@ -82,7 +78,6 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
 	defer session.close()
 	caches := session.caches
 	tokens := session.remaining
-	history := append([]int32(nil), session.inputs...)
 	prefillChunk := prefillChunkSize()

 	materializeCaches := func() {
@@ -115,13 +110,13 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
 		mlx.ClearCache()
 	}

-	step := func(token *mlx.Array, history []int32) (*mlx.Array, *mlx.Array) {
+	step := func(token *mlx.Array) (*mlx.Array, *mlx.Array) {
 		fwd := r.Model.Forward(token.ExpandDims(0), caches)
 		logits := r.Model.Unembed(fwd)
 		logits = logits.Slice(mlx.Slice(), mlx.Slice(logits.Dim(1)-1), mlx.Slice()).Squeeze(1)

 		logprobs := logits.Subtract(logits.Logsumexp(true))
-		sample := request.Sample(logprobs, history)
+		sample := request.Sample(logprobs)

 		mlx.Pin(sample, logprobs)
 		mlx.Sweep()
@@ -130,7 +125,7 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
 		return sample, logprobs
 	}

-	sample, logprobs = step(mlx.FromValues(tokens[processed:], total-processed), history)
+	sample, logprobs = step(mlx.FromValues(tokens[processed:], total-processed))

 	var b bytes.Buffer

@@ -140,6 +135,8 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
 			return err
 		}

+		nextSample, nextLogprobs = step(sample)
+
 		if i == 0 {
 			mlx.Eval(sample)
 			final.PromptEvalDuration = time.Since(now)
@@ -148,7 +145,6 @@ func (r *Runner) TextGenerationPipeline(request Request) error {

 		output := int32(sample.Int())
 		session.outputs = append(session.outputs, output)
-		history = append(history, output)

 		if r.Tokenizer.IsEOS(output) {
 			final.DoneReason = 0
@@ -157,15 +153,13 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
 		}

 		select {
-		case <-request.Ctx.Done():
-			return request.Ctx.Err()
+		case <-ctx.Done():
+			return ctx.Err()
 		case request.Responses <- CompletionResponse{
 			Content: r.Decode(output, &b),
 		}:
 		}

-		nextSample, nextLogprobs = step(sample, history)
-
 		mlx.Unpin(sample, logprobs)
 		sample, logprobs = nextSample, nextLogprobs
 		nextSample, nextLogprobs = nil, nil
--- a/x/mlxrunner/runner.go
+++ b/x/mlxrunner/runner.go
@@ -32,17 +32,12 @@ type Request struct {

 type TextCompletionsRequest struct {
 	Prompt  string `json:"prompt"`
-	Think   *bool  `json:"think,omitempty"`
 	Options struct {
-		Temperature      *float32 `json:"temperature"`
-		TopP             *float32 `json:"top_p"`
-		MinP             *float32 `json:"min_p"`
-		TopK             *int     `json:"top_k"`
-		RepeatLastN      *int     `json:"repeat_last_n"`
-		RepeatPenalty    *float32 `json:"repeat_penalty"`
-		PresencePenalty  *float32 `json:"presence_penalty"`
-		FrequencyPenalty *float32 `json:"frequency_penalty"`
-		MaxTokens        int      `json:"max_tokens"`
+		Temperature float32 `json:"temperature"`
+		TopP        float32 `json:"top_p"`
+		MinP        float32 `json:"min_p"`
+		TopK        int     `json:"top_k"`
+		MaxTokens   int     `json:"max_tokens"`

 		// Deprecated: use MaxTokens instead
 		NumPredict int `json:"num_predict"`
--- a/x/mlxrunner/sample/sample.go
+++ b/x/mlxrunner/sample/sample.go
@@ -9,204 +9,69 @@ import (
 )

 type Sampler interface {
-	Sample(*mlx.Array, []int32) *mlx.Array
+	Sample(*mlx.Array) *mlx.Array
 }

-func New(temp, top_p, min_p float32, top_k, repeatLastN int, repeatPenalty, presencePenalty, frequencyPenalty float32) Sampler {
-	var samplers []Sampler
-	if repeatLastN > 0 && (repeatPenalty != 1 || presencePenalty != 0 || frequencyPenalty != 0) {
-		samplers = append(samplers, Penalty{
-			RepeatLastN:      repeatLastN,
-			RepeatPenalty:    repeatPenalty,
-			PresencePenalty:  presencePenalty,
-			FrequencyPenalty: frequencyPenalty,
-		})
+func New(temp, top_p, min_p float32, top_k int) Sampler {
+	if temp == 0 {
+		return greedy{}
 	}

-	if temp == 0 {
-		samplers = append(samplers, greedy{})
-	} else {
-		samplers = append(samplers, Distribution{
-			Temperature: temp,
-			TopK:        top_k,
-			TopP:        top_p,
-			MinP:        min_p,
-		})
+	var samplers []Sampler
+	if top_p > 0 && top_p < 1 {
+		samplers = append(samplers, TopP(top_p))
 	}
+
+	if min_p != 0 {
+		samplers = append(samplers, MinP(min_p))
+	}
+
+	if top_k > 0 {
+		samplers = append(samplers, TopK(top_k))
+	}
+
+	samplers = append(samplers, Temperature(temp))
 	return chain(samplers)
 }

 type greedy struct{}

-func (greedy) Sample(logits *mlx.Array, _ []int32) *mlx.Array {
+func (greedy) Sample(logits *mlx.Array) *mlx.Array {
 	return logits.Argmax(-1, false)
 }

 type chain []Sampler

-func (c chain) Sample(logits *mlx.Array, history []int32) *mlx.Array {
+func (c chain) Sample(logits *mlx.Array) *mlx.Array {
 	for _, sampler := range c {
-		logits = sampler.Sample(logits, history)
+		logits = sampler.Sample(logits)
 	}
 	return logits
 }

-type Distribution struct {
-	Temperature float32
-	TopK        int
-	TopP        float32
-	MinP        float32
+type Temperature float32
+
+func (t Temperature) Sample(logits *mlx.Array) *mlx.Array {
+	return mlx.DivScalar(logits, float32(t)).Categorical(-1)
 }

-func (d Distribution) Sample(logits *mlx.Array, _ []int32) *mlx.Array {
-	filtered, indices := d.filter(logits)
-	sample := filtered.Categorical(-1)
-	if indices == nil {
-		return sample
-	}
+type TopP float32

-	positions := sample.ExpandDims(1)
-	return indices.TakeAlongAxis(positions, -1).Squeeze(1)
+func (p TopP) Sample(logprobs *mlx.Array) *mlx.Array {
+	// TODO: implement
+	return logprobs
 }

-func (d Distribution) filter(logits *mlx.Array) (*mlx.Array, *mlx.Array) {
-	candidates := logits
-	var candidateIndices *mlx.Array
+type MinP float32

-	if d.TopK > 0 && d.TopK < logits.Dim(logits.NumDims()-1) {
-		partitions := logits.Negative().ArgpartitionAxis(d.TopK-1, -1)
-		switch logits.NumDims() {
-		case 1:
-			candidateIndices = partitions.Slice(mlx.Slice(0, d.TopK))
-		default:
-			candidateIndices = partitions.Slice(mlx.Slice(), mlx.Slice(0, d.TopK))
-		}
-		candidates = logits.TakeAlongAxis(candidateIndices, -1)
-	}
-
-	if d.Temperature != 1 {
-		candidates = mlx.DivScalar(candidates, d.Temperature)
-	}
-
-	if !d.needsProbabilityFilters() {
-		return candidates, candidateIndices
-	}
-
-	order := candidates.Negative().ArgsortAxis(-1)
-	sortedLogits := candidates.TakeAlongAxis(order, -1)
-	sortedProbs := mlx.SoftmaxAxis(candidates, -1, true).TakeAlongAxis(order, -1)
-
-	remove := d.topPRemovalMask(sortedProbs)
-	if d.MinP > 0 {
-		minPRemove := d.minPRemovalMask(sortedProbs)
-		if remove == nil {
-			remove = minPRemove
-		} else {
-			remove = remove.LogicalOr(minPRemove)
-		}
-	}
-
-	if remove == nil {
-		return candidates, candidateIndices
-	}
-
-	negInf := mlx.FromValue(float32(math.Inf(-1)))
-	filtered := mlx.Where(remove, negInf, sortedLogits)
-	return candidates.PutAlongAxis(order, filtered, -1), candidateIndices
+func (p MinP) Sample(logprobs *mlx.Array) *mlx.Array {
+	// TODO: implement
+	return logprobs
 }

-func (d Distribution) needsProbabilityFilters() bool {
-	return (d.TopP > 0 && d.TopP < 1) || d.MinP > 0
-}
-
-func (d Distribution) topPRemovalMask(sortedProbs *mlx.Array) *mlx.Array {
-	if d.TopP <= 0 || d.TopP >= 1 {
-		return nil
-	}
-
-	threshold := mlx.NewScalarArray(d.TopP)
-	prevCum := sortedProbs.Cumsum(-1, false, true).Subtract(sortedProbs)
-	return prevCum.GreaterEqual(threshold)
-}
-
-func (d Distribution) minPRemovalMask(sortedProbs *mlx.Array) *mlx.Array {
-	if d.MinP <= 0 {
-		return nil
-	}
-
-	var maxProb *mlx.Array
-	switch sortedProbs.NumDims() {
-	case 1:
-		maxProb = sortedProbs.Slice(mlx.Slice(0, 1))
-	default:
-		maxProb = sortedProbs.Slice(mlx.Slice(), mlx.Slice(0, 1))
-	}
-
-	threshold := mlx.MulScalar(maxProb, d.MinP)
-	return sortedProbs.Less(threshold)
-}
-
-type Penalty struct {
-	RepeatLastN      int
-	RepeatPenalty    float32
-	PresencePenalty  float32
-	FrequencyPenalty float32
-}
-
-func (p Penalty) Sample(logprobs *mlx.Array, history []int32) *mlx.Array {
-	if len(history) == 0 {
-		return logprobs
-	}
-
-	window := p.RepeatLastN
-	if window <= 0 || window > len(history) {
-		window = len(history)
-	}
-
-	counts := make(map[int32]int, window)
-	order := make([]int32, 0, window)
-	for _, token := range history[len(history)-window:] {
-		if token < 0 {
-			continue
-		}
-		if counts[token] == 0 {
-			order = append(order, token)
-		}
-		counts[token]++
-	}
-	if len(order) == 0 {
-		return logprobs
-	}
-
-	indexShape := []int32{int32(len(order))}
-	valueShape := []int{len(order)}
-	if logprobs.NumDims() > 1 {
-		indexShape = []int32{1, int32(len(order))}
-		valueShape = []int{1, len(order)}
-	}
-
-	indices := mlx.NewArrayInt32(order, indexShape)
-	selected := logprobs.TakeAlongAxis(indices, -1)
-	mlx.Eval(selected)
-
-	values := selected.Floats()
-	for i, token := range order {
-		v := values[i]
-		if p.RepeatPenalty != 1 {
-			if v < 0 {
-				v *= p.RepeatPenalty
-			} else {
-				v /= p.RepeatPenalty
-			}
-		}
-		if p.PresencePenalty != 0 {
-			v -= p.PresencePenalty
-		}
-		if p.FrequencyPenalty != 0 {
-			v -= p.FrequencyPenalty * float32(counts[token])
-		}
-		values[i] = v
-	}
-
-	return logprobs.PutAlongAxis(indices, mlx.FromValues(values, valueShape...), -1)
+type TopK int
+
+func (k TopK) Sample(logprobs *mlx.Array) *mlx.Array {
+	mask := logprobs.Negative().ArgpartitionAxis(int(k)-1, -1).Slice(mlx.Slice(), mlx.Slice(int(k), 0))
+	return logprobs.PutAlongAxis(mask, mlx.FromValue(float32(math.Inf(-1))), -1)
 }
--- a/x/mlxrunner/sample/sample_test.go
+++ b/x/mlxrunner/sample/sample_test.go
@@ -1,104 +0,0 @@
-//go:build mlx
-
-package sample
-
-import (
-	"math"
-	"testing"
-
-	"github.com/ollama/ollama/x/mlxrunner/mlx"
-)
-
-func TestPenaltySample(t *testing.T) {
-	if err := mlx.CheckInit(); err != nil {
-		t.Skipf("MLX not available: %v", err)
-	}
-
-	logprobs := mlx.FromValues([]float32{
-		1.0, -2.0, 3.0, 4.0,
-	}, 1, 4)
-
-	got := Penalty{
-		RepeatLastN:      3,
-		RepeatPenalty:    2.0,
-		PresencePenalty:  1.5,
-		FrequencyPenalty: 0.25,
-	}.Sample(logprobs, []int32{2, 1, 2})
-
-	mlx.Eval(got)
-
-	want := []float32{1.0, -5.75, -0.5, 4.0}
-	values := got.Floats()
-	if len(values) != len(want) {
-		t.Fatalf("len(values) = %d, want %d", len(values), len(want))
-	}
-
-	for i := range want {
-		if math.Abs(float64(values[i]-want[i])) > 1e-5 {
-			t.Fatalf("values[%d] = %v, want %v", i, values[i], want[i])
-		}
-	}
-}
-
-func TestPenaltySampleHonorsRepeatWindow(t *testing.T) {
-	if err := mlx.CheckInit(); err != nil {
-		t.Skipf("MLX not available: %v", err)
-	}
-
-	logprobs := mlx.FromValues([]float32{
-		1.0, 2.0, 3.0,
-	}, 1, 3)
-
-	got := Penalty{
-		RepeatLastN:     1,
-		PresencePenalty: 1.0,
-	}.Sample(logprobs, []int32{0, 1})
-
-	mlx.Eval(got)
-
-	want := []float32{1.0, 1.0, 3.0}
-	values := got.Floats()
-	for i := range want {
-		if math.Abs(float64(values[i]-want[i])) > 1e-5 {
-			t.Fatalf("values[%d] = %v, want %v", i, values[i], want[i])
-		}
-	}
-}
-
-func TestDistributionFilterTopP(t *testing.T) {
-	if err := mlx.CheckInit(); err != nil {
-		t.Skipf("MLX not available: %v", err)
-	}
-
-	logits := mlx.FromValues([]float32{
-		10.0, 9.0, 1.0, 0.0,
-	}, 1, 4)
-
-	filtered, indices := Distribution{
-		Temperature: 1.0,
-		TopK:        2,
-		TopP:        0.55,
-	}.filter(logits)
-
-	got := materializeFilteredLogits(filtered, indices, 4)
-	mlx.Eval(got)
-
-	values := got.Floats()
-	if values[0] != 10.0 {
-		t.Fatalf("values[0] = %v, want 10", values[0])
-	}
-	for i := 1; i < len(values); i++ {
-		if !math.IsInf(float64(values[i]), -1) {
-			t.Fatalf("values[%d] = %v, want -Inf", i, values[i])
-		}
-	}
-}
-
-func materializeFilteredLogits(filtered, indices *mlx.Array, width int) *mlx.Array {
-	if indices == nil {
-		return filtered
-	}
-
-	base := mlx.AddScalar(mlx.Zeros(mlx.DTypeFloat32, 1, width), float32(math.Inf(-1)))
-	return base.PutAlongAxis(indices, filtered, -1)
-}
--- a/x/mlxrunner/server.go
+++ b/x/mlxrunner/server.go
@@ -16,89 +16,12 @@ import (
 	"strconv"
 	"time"

-	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/logutil"
 	"github.com/ollama/ollama/x/mlxrunner/mlx"
-	"github.com/ollama/ollama/x/mlxrunner/model/base"
 	"github.com/ollama/ollama/x/mlxrunner/sample"
-	"github.com/ollama/ollama/x/models/qwen3_5"
 )

-type samplingConfig struct {
-	temperature      float32
-	topP             float32
-	minP             float32
-	topK             int
-	repeatLastN      int
-	repeatPenalty    float32
-	presencePenalty  float32
-	frequencyPenalty float32
-}
-
-func defaultSamplingConfig(m base.Model, think *bool) samplingConfig {
-	if _, ok := m.(*qwen3_5.Model); ok {
-		cfg := samplingConfig{
-			temperature:      1.0,
-			topP:             0.95,
-			minP:             0.0,
-			topK:             20,
-			repeatLastN:      64,
-			repeatPenalty:    1.0,
-			presencePenalty:  1.5,
-			frequencyPenalty: 0.0,
-		}
-		if think != nil && !*think {
-			cfg.temperature = 0.7
-			cfg.topP = 0.8
-		}
-		return cfg
-	}
-
-	opts := api.DefaultOptions()
-	return samplingConfig{
-		temperature:      opts.Temperature,
-		topP:             opts.TopP,
-		minP:             opts.MinP,
-		topK:             opts.TopK,
-		repeatLastN:      opts.RepeatLastN,
-		repeatPenalty:    opts.RepeatPenalty,
-		presencePenalty:  opts.PresencePenalty,
-		frequencyPenalty: opts.FrequencyPenalty,
-	}
-}
-
-func resolveSamplingConfig(m base.Model, req Request) samplingConfig {
-	cfg := defaultSamplingConfig(m, req.Think)
-
-	if req.Options.Temperature != nil {
-		cfg.temperature = *req.Options.Temperature
-	}
-	if req.Options.TopP != nil {
-		cfg.topP = *req.Options.TopP
-	}
-	if req.Options.MinP != nil {
-		cfg.minP = *req.Options.MinP
-	}
-	if req.Options.TopK != nil {
-		cfg.topK = *req.Options.TopK
-	}
-	if req.Options.RepeatLastN != nil {
-		cfg.repeatLastN = *req.Options.RepeatLastN
-	}
-	if req.Options.RepeatPenalty != nil {
-		cfg.repeatPenalty = *req.Options.RepeatPenalty
-	}
-	if req.Options.PresencePenalty != nil {
-		cfg.presencePenalty = *req.Options.PresencePenalty
-	}
-	if req.Options.FrequencyPenalty != nil {
-		cfg.frequencyPenalty = *req.Options.FrequencyPenalty
-	}
-
-	return cfg
-}
-
 func Execute(args []string) error {
 	slog.SetDefault(logutil.NewLogger(os.Stderr, envconfig.LogLevel()))

@@ -167,18 +90,12 @@ func Execute(args []string) error {

 		request.Options.MaxTokens = cmp.Or(request.Options.MaxTokens, request.Options.NumPredict)

-		sampling := resolveSamplingConfig(runner.Model, request)
-
 		request.Pipeline = runner.TextGenerationPipeline
 		request.Sampler = sample.New(
-			sampling.temperature,
-			sampling.topP,
-			sampling.minP,
-			sampling.topK,
-			sampling.repeatLastN,
-			sampling.repeatPenalty,
-			sampling.presencePenalty,
-			sampling.frequencyPenalty,
+			request.Options.Temperature,
+			request.Options.TopP,
+			request.Options.MinP,
+			request.Options.TopK,
 		)

 		var cancel context.CancelFunc
--- a/x/mlxrunner/server_test.go
+++ b/x/mlxrunner/server_test.go
@@ -1,172 +0,0 @@
-//go:build mlx
-
-package mlxrunner
-
-import (
-	"testing"
-
-	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/x/mlxrunner/cache"
-	"github.com/ollama/ollama/x/mlxrunner/mlx"
-	"github.com/ollama/ollama/x/mlxrunner/model/base"
-	"github.com/ollama/ollama/x/models/qwen3_5"
-	"github.com/ollama/ollama/x/tokenizer"
-)
-
-type stubModel struct{}
-
-func (stubModel) Forward(*mlx.Array, []cache.Cache) *mlx.Array { return nil }
-func (stubModel) Unembed(*mlx.Array) *mlx.Array                { return nil }
-func (stubModel) NumLayers() int                               { return 0 }
-func (stubModel) Tokenizer() *tokenizer.Tokenizer              { return nil }
-func (stubModel) LoadWeights(map[string]*mlx.Array) error      { return nil }
-
-func TestResolveSamplingConfigDefaults(t *testing.T) {
-	trueValue := true
-	falseValue := false
-
-	tests := []struct {
-		name  string
-		model base.Model
-		req   Request
-		want  samplingConfig
-	}{
-		{
-			name:  "generic model uses api defaults",
-			model: stubModel{},
-			req:   Request{},
-			want: samplingConfig{
-				temperature:      0.8,
-				topP:             0.9,
-				minP:             0.0,
-				topK:             40,
-				repeatLastN:      64,
-				repeatPenalty:    1.1,
-				presencePenalty:  0.0,
-				frequencyPenalty: 0.0,
-			},
-		},
-		{
-			name:  "qwen3.5 defaults to thinking profile when think unset",
-			model: &qwen3_5.Model{},
-			req:   Request{},
-			want: samplingConfig{
-				temperature:      1.0,
-				topP:             0.95,
-				minP:             0.0,
-				topK:             20,
-				repeatLastN:      64,
-				repeatPenalty:    1.0,
-				presencePenalty:  1.5,
-				frequencyPenalty: 0.0,
-			},
-		},
-		{
-			name:  "qwen3.5 thinking disabled defaults",
-			model: &qwen3_5.Model{},
-			req:   Request{TextCompletionsRequest: TextCompletionsRequest{Think: &falseValue}},
-			want: samplingConfig{
-				temperature:      0.7,
-				topP:             0.8,
-				minP:             0.0,
-				topK:             20,
-				repeatLastN:      64,
-				repeatPenalty:    1.0,
-				presencePenalty:  1.5,
-				frequencyPenalty: 0.0,
-			},
-		},
-		{
-			name:  "qwen3.5 thinking enabled defaults",
-			model: &qwen3_5.Model{},
-			req:   Request{TextCompletionsRequest: TextCompletionsRequest{Think: &trueValue}},
-			want: samplingConfig{
-				temperature:      1.0,
-				topP:             0.95,
-				minP:             0.0,
-				topK:             20,
-				repeatLastN:      64,
-				repeatPenalty:    1.0,
-				presencePenalty:  1.5,
-				frequencyPenalty: 0.0,
-			},
-		},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			if got := resolveSamplingConfig(tt.model, tt.req); got != tt.want {
-				t.Fatalf("resolveSamplingConfig() = %+v, want %+v", got, tt.want)
-			}
-		})
-	}
-}
-
-func TestResolveSamplingConfigOverridesSpecifiedValues(t *testing.T) {
-	trueValue := true
-	temperature := float32(0.4)
-	topP := float32(0.6)
-	minP := float32(0.05)
-	topK := 12
-	repeatLastN := 32
-	repeatPenalty := float32(1.1)
-	presencePenalty := float32(0.7)
-	frequencyPenalty := float32(0.2)
-
-	got := resolveSamplingConfig(stubModel{}, Request{
-		TextCompletionsRequest: TextCompletionsRequest{
-			Think: &trueValue,
-			Options: struct {
-				Temperature      *float32 `json:"temperature"`
-				TopP             *float32 `json:"top_p"`
-				MinP             *float32 `json:"min_p"`
-				TopK             *int     `json:"top_k"`
-				RepeatLastN      *int     `json:"repeat_last_n"`
-				RepeatPenalty    *float32 `json:"repeat_penalty"`
-				PresencePenalty  *float32 `json:"presence_penalty"`
-				FrequencyPenalty *float32 `json:"frequency_penalty"`
-				MaxTokens        int      `json:"max_tokens"`
-				NumPredict       int      `json:"num_predict"`
-			}{
-				Temperature:      &temperature,
-				TopP:             &topP,
-				MinP:             &minP,
-				TopK:             &topK,
-				RepeatLastN:      &repeatLastN,
-				RepeatPenalty:    &repeatPenalty,
-				PresencePenalty:  &presencePenalty,
-				FrequencyPenalty: &frequencyPenalty,
-			},
-		},
-	})
-
-	want := samplingConfig{
-		temperature:      temperature,
-		topP:             topP,
-		minP:             minP,
-		topK:             topK,
-		repeatLastN:      repeatLastN,
-		repeatPenalty:    repeatPenalty,
-		presencePenalty:  presencePenalty,
-		frequencyPenalty: frequencyPenalty,
-	}
-	if got != want {
-		t.Fatalf("resolveSamplingConfig() = %+v, want %+v", got, want)
-	}
-}
-
-func TestResolveSamplingConfigMatchesGenericDefaults(t *testing.T) {
-	want := api.DefaultOptions()
-	got := defaultSamplingConfig(stubModel{}, nil)
-
-	if got.temperature != want.Temperature ||
-		got.topP != want.TopP ||
-		got.minP != want.MinP ||
-		got.topK != want.TopK ||
-		got.repeatLastN != want.RepeatLastN ||
-		got.repeatPenalty != want.RepeatPenalty ||
-		got.presencePenalty != want.PresencePenalty ||
-		got.frequencyPenalty != want.FrequencyPenalty {
-		t.Fatalf("defaultSamplingConfig() = %+v, want api defaults %+v", got, want)
-	}
-}
--- a/x/models/qwen3_5/qwen3_5.go
+++ b/x/models/qwen3_5/qwen3_5.go
@@ -437,15 +437,15 @@ func freeTensorKeys(tensors map[string]*mlx.Array, keys ...string) {
 	}
 }

-func stackAndDetach(parts []*mlx.Array) *mlx.Array {
+func stackAndClone(parts []*mlx.Array) *mlx.Array {
 	if len(parts) == 0 {
 		return nil
 	}
 	stacked := mlx.Stack(parts, 0)
-	detached := mlx.Detach(stacked)
-	mlx.Eval(detached)
+	cloned := stacked.Clone()
+	mlx.Eval(cloned)
 	mlx.Unpin(stacked)
-	return detached
+	return cloned
 }

 func transposeExpertWeightForGatherMM(w *mlx.Array) *mlx.Array {
@@ -453,10 +453,10 @@ func transposeExpertWeightForGatherMM(w *mlx.Array) *mlx.Array {
 		return w
 	}
 	t := mlx.Transpose(w, 0, 2, 1)
-	d := mlx.Detach(t)
-	mlx.Eval(d)
+	cloned := t.Clone()
+	mlx.Eval(cloned)
 	mlx.Unpin(t)
-	return d
+	return cloned
 }

 func describeMoEProjection(prefix string, w *stackedExpertWeights) string {
@@ -612,12 +612,12 @@ func collectPerExpertProjection(tensors map[string]*mlx.Array, cfg *Config, useQ
 		return nil
 	}

-	out := &stackedExpertWeights{Weight: stackAndDetach(weights), Bits: bits, GroupSize: groupSize, Mode: mode}
+	out := &stackedExpertWeights{Weight: stackAndClone(weights), Bits: bits, GroupSize: groupSize, Mode: mode}
 	if len(scales) == len(weights) {
-		out.Scales = stackAndDetach(scales)
+		out.Scales = stackAndClone(scales)
 	}
 	if len(biases) == len(weights) {
-		out.Biases = stackAndDetach(biases)
+		out.Biases = stackAndClone(biases)
 	}
 	freeTensorKeys(tensors, consumedKeys...)
 	return out
@@ -1073,16 +1073,6 @@ func softplus(x *mlx.Array) *mlx.Array {
 	return mlx.Log(mlx.AddScalar(mlx.Exp(x), 1.0))
 }

-func repeatHeads(x *mlx.Array, repeatFactor int32) *mlx.Array {
-	if repeatFactor <= 1 {
-		return x
-	}
-	shape := x.Dims()
-	x = mlx.ExpandDims(x, 3)
-	x = mlx.Tile(x, []int32{1, 1, 1, repeatFactor, 1})
-	return mlx.Reshape(x, int32(shape[0]), int32(shape[1]), int32(shape[2])*repeatFactor, int32(shape[3]))
-}
-
 func depthwiseCausalConv1d(x, w *mlx.Array, outLen int32) *mlx.Array {
 	if x == nil || w == nil {
 		return nil
@@ -1235,7 +1225,6 @@ func (g *GatedDeltaNet) Forward(x *mlx.Array, c cache.Cache, B, L int32, cfg *Co
 	k = mlx.Reshape(k, B, L, cfg.LinearNumKeyHeads, cfg.LinearKeyHeadDim)
 	v = mlx.Reshape(v, B, L, cfg.LinearNumValueHeads, cfg.LinearValueHeadDim)
 	invScale := float32(1.0 / math.Sqrt(float64(cfg.LinearKeyHeadDim)))
-	repeatFactor := cfg.LinearNumValueHeads / cfg.LinearNumKeyHeads
 	q = mlx.MulScalar(mlx.RMSNormFn(q, nil, 1e-6), invScale*invScale)
 	k = mlx.MulScalar(mlx.RMSNormFn(k, nil, 1e-6), invScale)

@@ -1256,50 +1245,7 @@ func (g *GatedDeltaNet) Forward(x *mlx.Array, c cache.Cache, B, L int32, cfg *Co
 		state = mlx.Zeros(x.DType(), int(B), int(cfg.LinearNumValueHeads), int(cfg.LinearValueHeadDim), int(cfg.LinearKeyHeadDim))
 	}

-	var out *mlx.Array
-	if fusedOut, fusedState, fused := mlx.GatedDeltaKernel(q, k, v, gDecay, beta, state); fused {
-		out = fusedOut
-		state = fusedState
-	} else if L == 1 {
-		if repeatFactor > 1 {
-			q = repeatHeads(q, repeatFactor)
-			k = repeatHeads(k, repeatFactor)
-		}
-		// Fast decode path: avoid per-token slice/append graph construction.
-		qt := mlx.Squeeze(q, 1)
-		kt := mlx.Squeeze(k, 1)
-		vt := mlx.Squeeze(v, 1)
-		gt := mlx.Squeeze(gDecay, 1)
-		bt := mlx.Squeeze(beta, 1)
-
-		state = mlx.Mul(state, mlx.ExpandDims(mlx.ExpandDims(gt, -1), -1))
-		kvMem := mlx.Sum(mlx.Mul(state, mlx.ExpandDims(kt, 2)), -1, false)
-		delta := mlx.Mul(mlx.Sub(vt, kvMem), mlx.ExpandDims(bt, -1))
-		state = mlx.Add(state, mlx.Mul(mlx.ExpandDims(kt, 2), mlx.ExpandDims(delta, -1)))
-		yt := mlx.Sum(mlx.Mul(state, mlx.ExpandDims(qt, 2)), -1, false)
-		out = mlx.ExpandDims(yt, 1)
-	} else {
-		if repeatFactor > 1 {
-			q = repeatHeads(q, repeatFactor)
-			k = repeatHeads(k, repeatFactor)
-		}
-		outs := make([]*mlx.Array, 0, L)
-		for t := int32(0); t < L; t++ {
-			qt := mlx.Squeeze(mlx.SliceStartStop(q, []int32{0, t, 0, 0}, []int32{B, t + 1, cfg.LinearNumValueHeads, cfg.LinearKeyHeadDim}), 1)
-			kt := mlx.Squeeze(mlx.SliceStartStop(k, []int32{0, t, 0, 0}, []int32{B, t + 1, cfg.LinearNumValueHeads, cfg.LinearKeyHeadDim}), 1)
-			vt := mlx.Squeeze(mlx.SliceStartStop(v, []int32{0, t, 0, 0}, []int32{B, t + 1, cfg.LinearNumValueHeads, cfg.LinearValueHeadDim}), 1)
-			gt := mlx.Squeeze(mlx.SliceStartStop(gDecay, []int32{0, t, 0}, []int32{B, t + 1, cfg.LinearNumValueHeads}), 1)
-			bt := mlx.Squeeze(mlx.SliceStartStop(beta, []int32{0, t, 0}, []int32{B, t + 1, cfg.LinearNumValueHeads}), 1)
-
-			state = mlx.Mul(state, mlx.ExpandDims(mlx.ExpandDims(gt, -1), -1))
-			kvMem := mlx.Sum(mlx.Mul(state, mlx.ExpandDims(kt, 2)), -1, false)
-			delta := mlx.Mul(mlx.Sub(vt, kvMem), mlx.ExpandDims(bt, -1))
-			state = mlx.Add(state, mlx.Mul(mlx.ExpandDims(kt, 2), mlx.ExpandDims(delta, -1)))
-			yt := mlx.Sum(mlx.Mul(state, mlx.ExpandDims(qt, 2)), -1, false)
-			outs = append(outs, mlx.ExpandDims(yt, 1))
-		}
-		out = mlx.Concatenate(outs, 1)
-	}
+	out, state := mlx.GatedDelta(q, k, v, gDecay, beta, state)
 	out = mlx.RMSNormFn(out, g.NormWeight, cfg.RMSNormEps)
 	out = mlx.Mul(out, mlx.SiLU(z))
 	out = mlx.Reshape(out, B, L, valueDim)
@@ -1432,6 +1378,10 @@ func (m *Model) NumLayers() int {
 	return len(m.Layers)
 }

+func (m *Model) MaxContextLength() int {
+	return int(m.MaxPositionEmbeddings)
+}
+
 func (m *Model) Tokenizer() *tokenizer.Tokenizer {
 	return m.tok
 }
@@ -1449,9 +1399,3 @@ func (m *Model) NewCaches() []cache.Cache {
 	}
 	return caches
 }
-
-// DisablePromptCache returns false to allow append-only prompt cache reuse.
-// Recurrent caches report CanTrim=false, so divergent prefixes are dropped.
-func (m *Model) DisablePromptCache() bool {
-	return false
-}
--- a/x/models/qwen3_5/qwen3_5_test.go
+++ b/x/models/qwen3_5/qwen3_5_test.go
@@ -126,13 +126,6 @@ func TestResolveTensorPathLayout(t *testing.T) {
 	}
 }

-func TestModelRuntimeDefaults(t *testing.T) {
-	m := &Model{}
-	if m.DisablePromptCache() {
-		t.Fatal("DisablePromptCache() = true, want false")
-	}
-}
-
 func TestNewCachesLayout(t *testing.T) {
 	m := &Model{
 		Config: &Config{
Author	SHA1	Message	Date
Patrick Devine	5ec9c4fe1d	add MaxContextLength()	2026-02-28 16:59:58 -08:00
Patrick Devine	1dd5cbd6c3	address comments	2026-02-28 16:56:54 -08:00
Patrick Devine	59ab14602b	cleanup	2026-02-28 16:56:54 -08:00
Patrick Devine	8be9ce59a5	add qwen3.5	2026-02-28 16:56:54 -08:00
Patrick Devine	1b56e7e7b2	smaller recurrent cache	2026-02-28 16:56:54 -08:00