Compare commits

..

5 Commits

Author SHA1 Message Date
Patrick Devine
5ec9c4fe1d add MaxContextLength() 2026-02-28 16:59:58 -08:00
Patrick Devine
1dd5cbd6c3 address comments 2026-02-28 16:56:54 -08:00
Patrick Devine
59ab14602b cleanup 2026-02-28 16:56:54 -08:00
Patrick Devine
8be9ce59a5 add qwen3.5 2026-02-28 16:56:54 -08:00
Patrick Devine
1b56e7e7b2 smaller recurrent cache 2026-02-28 16:56:54 -08:00
17 changed files with 240 additions and 981 deletions

View File

@@ -1453,12 +1453,10 @@ type ImageData struct {
}
type CompletionRequest struct {
Prompt string
Format json.RawMessage
Images []ImageData
Options *api.Options
Think *api.ThinkValue
ExplicitOptions map[string]struct{}
Prompt string
Format json.RawMessage
Images []ImageData
Options *api.Options
Grammar string // set before sending the request to the subprocess
Shift bool

View File

@@ -130,35 +130,6 @@ func (s *Server) modelOptions(model *Model, requestOpts map[string]any) (api.Opt
return opts, nil
}
// explicitOptions reports which sampling options were set explicitly.
//
// It checks a fixed list of sampling option names and returns the subset that
// optionSpecified reports as present in either modelOpts or requestOpts. The
// result is a set keyed by option name; it is empty (never nil) when none of
// the names are present.
func explicitOptions(modelOpts, requestOpts map[string]any) map[string]struct{} {
	samplingKeys := [...]string{
		"temperature",
		"top_p",
		"min_p",
		"top_k",
		"repeat_last_n",
		"repeat_penalty",
		"presence_penalty",
		"frequency_penalty",
	}

	set := make(map[string]struct{}, len(samplingKeys))
	for i := range samplingKeys {
		name := samplingKeys[i]
		if !optionSpecified(modelOpts, requestOpts, name) {
			continue
		}
		set[name] = struct{}{}
	}
	return set
}
// optionSpecified reports whether key is present in requestOpts or, failing
// that, in modelOpts. Only key presence is checked; the stored value is ignored.
func optionSpecified(modelOpts, requestOpts map[string]any, key string) bool {
	for _, opts := range [2]map[string]any{requestOpts, modelOpts} {
		if _, present := opts[key]; present {
			return true
		}
	}
	return false
}
// scheduleRunner schedules a runner after validating inputs such as capabilities and model options.
// It returns the allocated runner, model instance, and consolidated options if successful and error otherwise.
func (s *Server) scheduleRunner(ctx context.Context, name string, caps []model.Capability, requestOpts map[string]any, keepAlive *api.Duration) (llm.LlamaServer, *Model, *api.Options, error) {
@@ -568,16 +539,14 @@ func (s *Server) GenerateHandler(c *gin.Context) {
var sb strings.Builder
defer close(ch)
if err := r.Completion(c.Request.Context(), llm.CompletionRequest{
Prompt: prompt,
Images: images,
Format: req.Format,
Options: opts,
Think: req.Think,
ExplicitOptions: explicitOptions(m.Options, req.Options),
Shift: req.Shift == nil || *req.Shift,
Truncate: req.Truncate == nil || *req.Truncate,
Logprobs: req.Logprobs,
TopLogprobs: req.TopLogprobs,
Prompt: prompt,
Images: images,
Format: req.Format,
Options: opts,
Shift: req.Shift == nil || *req.Shift,
Truncate: req.Truncate == nil || *req.Truncate,
Logprobs: req.Logprobs,
TopLogprobs: req.TopLogprobs,
}, func(cr llm.CompletionResponse) {
res := api.GenerateResponse{
Model: req.Model,
@@ -2329,16 +2298,14 @@ func (s *Server) ChatHandler(c *gin.Context) {
// sets up new context given parent context per request
ctx, cancel := context.WithCancel(c.Request.Context())
err := r.Completion(ctx, llm.CompletionRequest{
Prompt: prompt,
Images: images,
Format: currentFormat,
Options: opts,
Think: req.Think,
ExplicitOptions: explicitOptions(m.Options, req.Options),
Shift: req.Shift == nil || *req.Shift,
Truncate: truncate,
Logprobs: req.Logprobs,
TopLogprobs: req.TopLogprobs,
Prompt: prompt,
Images: images,
Format: currentFormat,
Options: opts,
Shift: req.Shift == nil || *req.Shift,
Truncate: truncate,
Logprobs: req.Logprobs,
TopLogprobs: req.TopLogprobs,
}, func(r llm.CompletionResponse) {
res := api.ChatResponse{
Model: req.Model,

View File

@@ -109,6 +109,8 @@ func (s *cacheSession) close() {
if kv == nil {
continue
}
// Mixed cache types (e.g. recurrent + KV) can transiently report different
// offsets, so use the minimum as the safe reusable token prefix.
if off := kv.Offset(); offset < 0 || off < offset {
offset = off
}

View File

@@ -20,21 +20,19 @@ type RecurrentCache struct {
headKDim int
}
func (c *RecurrentCache) setStateMaterialized(dst **mlx.Array, v *mlx.Array) {
func (c *RecurrentCache) setStateMaterialized(old, v *mlx.Array) *mlx.Array {
if v == nil || !v.Valid() {
return
return old
}
if *dst == v {
return
if old == v {
return old
}
// Break dependency chains so recurrent state does not retain the full
// per-token compute graph over time.
snap := mlx.Snapshot(v)
snap := mlx.Copy(v)
mlx.Eval(snap)
old := *dst
*dst = snap
mlx.Pin(snap)
// Drop references to the previous cached state root and transient incoming
@@ -46,40 +44,40 @@ func (c *RecurrentCache) setStateMaterialized(dst **mlx.Array, v *mlx.Array) {
if v != snap && v != old {
mlx.Unpin(v)
}
return snap
}
func (c *RecurrentCache) setStateRaw(dst **mlx.Array, v *mlx.Array) {
func (c *RecurrentCache) setStateRaw(old, v *mlx.Array) *mlx.Array {
if v == nil || !v.Valid() {
return
return old
}
if *dst == v {
return
if old == v {
return old
}
old := *dst
*dst = v
mlx.Pin(v)
if old != nil && old != v {
mlx.Unpin(old)
}
return v
}
func (c *RecurrentCache) setStateDetached(dst **mlx.Array, v *mlx.Array, ensureContiguous bool) {
func (c *RecurrentCache) setStateDetached(old, v *mlx.Array, ensureContiguous bool) *mlx.Array {
if v == nil || !v.Valid() {
return
return old
}
if *dst == v {
return
if old == v {
return old
}
root := v
if ensureContiguous {
root = mlx.Contiguous(v, false)
}
detached := mlx.Detach(root)
detached := root.Clone()
old := *dst
*dst = detached
mlx.Pin(detached)
if old != nil && old != detached {
mlx.Unpin(old)
@@ -88,13 +86,15 @@ func (c *RecurrentCache) setStateDetached(dst **mlx.Array, v *mlx.Array, ensureC
// Intentionally do not force-release root/v here. In the fast path, the detached
// handle aliases the same MLX value and may still be lazily computed. Releasing the
// source handles can invalidate the cached state before the next eval/sweep point.
return detached
}
func snapshotPinned(a *mlx.Array) *mlx.Array {
if a == nil || !a.Valid() {
return nil
}
snap := mlx.Snapshot(a)
snap := mlx.Copy(a)
mlx.Eval(snap)
mlx.Pin(snap)
return snap
@@ -124,10 +124,10 @@ func (c *RecurrentCache) ensure(batch int, dtype mlx.DType) {
}
if needConv {
c.setStateRaw(&c.convState, mlx.Zeros(dtype, batch, c.convTail, c.convDim))
c.convState = c.setStateRaw(c.convState, mlx.Zeros(dtype, batch, c.convTail, c.convDim))
}
if needDelta {
c.setStateRaw(&c.deltaState, mlx.Zeros(dtype, batch, c.numVHeads, c.headVDim, c.headKDim))
c.deltaState = c.setStateRaw(c.deltaState, mlx.Zeros(dtype, batch, c.numVHeads, c.headVDim, c.headKDim))
}
}
@@ -137,7 +137,7 @@ func (c *RecurrentCache) ConvState(batch int, dtype mlx.DType) *mlx.Array {
}
func (c *RecurrentCache) SetConvState(v *mlx.Array) {
c.setStateMaterialized(&c.convState, v)
c.convState = c.setStateMaterialized(c.convState, v)
}
// SetConvStateFast stores conv state without forcing an immediate snapshot/eval.
@@ -145,7 +145,7 @@ func (c *RecurrentCache) SetConvState(v *mlx.Array) {
// sync/sweep point. The conv-state input is usually a slice view, so request a
// compact contiguous copy to avoid pinning the whole source buffer.
func (c *RecurrentCache) SetConvStateFast(v *mlx.Array) {
c.setStateDetached(&c.convState, v, true)
c.convState = c.setStateDetached(c.convState, v, true)
}
func (c *RecurrentCache) DeltaState(batch int, dtype mlx.DType) *mlx.Array {
@@ -154,14 +154,14 @@ func (c *RecurrentCache) DeltaState(batch int, dtype mlx.DType) *mlx.Array {
}
func (c *RecurrentCache) SetDeltaState(v *mlx.Array) {
c.setStateMaterialized(&c.deltaState, v)
c.deltaState = c.setStateMaterialized(c.deltaState, v)
}
// SetDeltaStateFast stores delta state without forcing an immediate snapshot/eval.
// Use only for decode hot paths that accept higher transient memory until the next
// sync/sweep point.
func (c *RecurrentCache) SetDeltaStateFast(v *mlx.Array) {
c.setStateDetached(&c.deltaState, v, false)
c.deltaState = c.setStateDetached(c.deltaState, v, false)
}
func (c *RecurrentCache) Advance(n int) {

View File

@@ -182,20 +182,15 @@ func (c *Client) waitUntilRunning() error {
// completionRequest is a properly-tagged version of llm.CompletionRequest for JSON serialization.
type completionRequest struct {
Prompt string `json:"prompt"`
Think *bool `json:"think,omitempty"`
Options *completionOpts `json:"options,omitempty"`
}
type completionOpts struct {
Temperature *float32 `json:"temperature,omitempty"`
TopP *float32 `json:"top_p,omitempty"`
MinP *float32 `json:"min_p,omitempty"`
TopK *int `json:"top_k,omitempty"`
RepeatLastN *int `json:"repeat_last_n,omitempty"`
RepeatPenalty *float32 `json:"repeat_penalty,omitempty"`
PresencePenalty *float32 `json:"presence_penalty,omitempty"`
FrequencyPenalty *float32 `json:"frequency_penalty,omitempty"`
NumPredict int `json:"num_predict,omitempty"`
Temperature float32 `json:"temperature,omitempty"`
TopP float32 `json:"top_p,omitempty"`
MinP float32 `json:"min_p,omitempty"`
TopK int `json:"top_k,omitempty"`
NumPredict int `json:"num_predict,omitempty"`
}
type CompletionResponse struct {
@@ -233,27 +228,16 @@ func (c *Client) Close() error {
// Completion implements llm.LlamaServer.
func (c *Client) Completion(ctx context.Context, req llm.CompletionRequest, fn func(llm.CompletionResponse)) error {
var think *bool
if req.Think != nil {
enabled := req.Think.Bool()
think = &enabled
}
creq := completionRequest{
Prompt: req.Prompt,
Think: think,
}
if req.Options != nil {
creq.Options = &completionOpts{
Temperature: float32Ptr(req.Options.Temperature, hasExplicitOption(req.ExplicitOptions, "temperature")),
TopP: float32Ptr(req.Options.TopP, hasExplicitOption(req.ExplicitOptions, "top_p")),
MinP: float32Ptr(req.Options.MinP, hasExplicitOption(req.ExplicitOptions, "min_p")),
TopK: intPtr(req.Options.TopK, hasExplicitOption(req.ExplicitOptions, "top_k")),
RepeatLastN: intPtr(req.Options.RepeatLastN, hasExplicitOption(req.ExplicitOptions, "repeat_last_n")),
RepeatPenalty: float32Ptr(req.Options.RepeatPenalty, hasExplicitOption(req.ExplicitOptions, "repeat_penalty")),
PresencePenalty: float32Ptr(req.Options.PresencePenalty, hasExplicitOption(req.ExplicitOptions, "presence_penalty")),
FrequencyPenalty: float32Ptr(req.Options.FrequencyPenalty, hasExplicitOption(req.ExplicitOptions, "frequency_penalty")),
NumPredict: req.Options.NumPredict,
Temperature: req.Options.Temperature,
TopP: req.Options.TopP,
MinP: req.Options.MinP,
TopK: req.Options.TopK,
NumPredict: req.Options.NumPredict,
}
}
@@ -312,25 +296,6 @@ func (c *Client) Completion(ctx context.Context, req llm.CompletionRequest, fn f
return scanner.Err()
}
// hasExplicitOption reports whether key is a member of the explicit set.
// A nil set contains nothing.
func hasExplicitOption(explicit map[string]struct{}, key string) bool {
	if _, found := explicit[key]; found {
		return true
	}
	return false
}
// float32Ptr returns a pointer to a fresh copy of v when ok is true, and nil
// otherwise.
func float32Ptr(v float32, ok bool) *float32 {
	if ok {
		p := new(float32)
		*p = v
		return p
	}
	return nil
}
// intPtr returns a pointer to a fresh copy of v when ok is true, and nil
// otherwise.
func intPtr(v int, ok bool) *int {
	if ok {
		p := new(int)
		*p = v
		return p
	}
	return nil
}
// ContextLength returns the value currently held in c.contextLength,
// converted to int.
func (c *Client) ContextLength() int {
	n := c.contextLength.Load()
	return int(n)
}

View File

@@ -1,167 +0,0 @@
package mlxrunner
import (
"context"
"encoding/json"
"io"
"net/http"
"strings"
"testing"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/llm"
)
// TestCompletionForwardsThink checks that Client.Completion copies the Think
// setting of an llm.CompletionRequest into the JSON body it posts to
// "/completion": nil stays absent, boolean values pass through, and a
// non-boolean level such as "high" is expected to arrive as true.
func TestCompletionForwardsThink(t *testing.T) {
boolPtr := func(v bool) *bool { return &v }
testCases := []struct {
name string
think *api.ThinkValue
want *bool
}{
{name: "unset", think: nil, want: nil},
{name: "enabled", think: &api.ThinkValue{Value: true}, want: boolPtr(true)},
{name: "disabled", think: &api.ThinkValue{Value: false}, want: boolPtr(false)},
{name: "level maps to enabled", think: &api.ThinkValue{Value: "high"}, want: boolPtr(true)},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
// got captures the request body decoded by the fake transport below.
var got completionRequest
// The fake transport records the request instead of hitting the network
// and answers with a single "done" line so Completion can finish.
rt := roundTripFunc(func(r *http.Request) (*http.Response, error) {
if r.URL.Path != "/completion" {
t.Fatalf("request path = %q, want %q", r.URL.Path, "/completion")
}
if err := json.NewDecoder(r.Body).Decode(&got); err != nil {
return nil, err
}
return &http.Response{
StatusCode: http.StatusOK,
Header: make(http.Header),
Body: io.NopCloser(strings.NewReader("{\"done\":true}\n")),
Request: r,
}, nil
})
c := &Client{
port: 11434,
client: &http.Client{
Transport: rt,
},
}
err := c.Completion(context.Background(), llm.CompletionRequest{
Prompt: "hello",
Think: tc.think,
}, func(llm.CompletionResponse) {})
if err != nil {
t.Fatalf("completion request failed: %v", err)
}
if got.Prompt != "hello" {
t.Fatalf("prompt = %q, want %q", got.Prompt, "hello")
}
// Compare optional booleans: both nil, or both set to the same value.
switch {
case tc.want == nil && got.Think != nil:
t.Fatalf("think = %v, want nil", *got.Think)
case tc.want != nil && got.Think == nil:
t.Fatalf("think = nil, want %v", *tc.want)
case tc.want != nil && got.Think != nil && *tc.want != *got.Think:
t.Fatalf("think = %v, want %v", *got.Think, *tc.want)
}
})
}
}
// TestCompletionForwardsOnlySpecifiedSamplingOptions checks that
// Client.Completion serializes a sampling option only when its name is listed
// in CompletionRequest.ExplicitOptions. Options that are populated in
// api.Options but not listed must decode as nil on the receiving side, while
// NumPredict is expected to be forwarded regardless.
func TestCompletionForwardsOnlySpecifiedSamplingOptions(t *testing.T) {
// got captures the request body decoded by the fake transport below.
var got completionRequest
rt := roundTripFunc(func(r *http.Request) (*http.Response, error) {
if err := json.NewDecoder(r.Body).Decode(&got); err != nil {
return nil, err
}
return &http.Response{
StatusCode: http.StatusOK,
Header: make(http.Header),
Body: io.NopCloser(strings.NewReader("{\"done\":true}\n")),
Request: r,
}, nil
})
c := &Client{
port: 11434,
client: &http.Client{
Transport: rt,
},
}
// Every sampling field is given a value so that absence on the wire can
// only come from the ExplicitOptions filter, not from an unset field.
opts := &api.Options{
Temperature: 1.0,
TopP: 0.95,
MinP: 0.1,
TopK: 20,
RepeatLastN: 128,
RepeatPenalty: 1.2,
PresencePenalty: 1.5,
FrequencyPenalty: 0.25,
NumPredict: 64,
}
err := c.Completion(context.Background(), llm.CompletionRequest{
Prompt: "hello",
Options: opts,
ExplicitOptions: map[string]struct{}{
"temperature": {},
"top_k": {},
"repeat_penalty": {},
"presence_penalty": {},
},
}, func(llm.CompletionResponse) {})
if err != nil {
t.Fatalf("completion request failed: %v", err)
}
if got.Options == nil {
t.Fatal("options = nil, want serialized options")
}
// Options named in ExplicitOptions must arrive with their values.
if got.Options.Temperature == nil || *got.Options.Temperature != opts.Temperature {
t.Fatalf("temperature = %v, want %v", got.Options.Temperature, opts.Temperature)
}
if got.Options.TopK == nil || *got.Options.TopK != opts.TopK {
t.Fatalf("top_k = %v, want %v", got.Options.TopK, opts.TopK)
}
if got.Options.RepeatPenalty == nil || *got.Options.RepeatPenalty != opts.RepeatPenalty {
t.Fatalf("repeat_penalty = %v, want %v", got.Options.RepeatPenalty, opts.RepeatPenalty)
}
if got.Options.PresencePenalty == nil || *got.Options.PresencePenalty != opts.PresencePenalty {
t.Fatalf("presence_penalty = %v, want %v", got.Options.PresencePenalty, opts.PresencePenalty)
}
// Options not named in ExplicitOptions must be omitted.
if got.Options.TopP != nil {
t.Fatalf("top_p = %v, want nil", *got.Options.TopP)
}
if got.Options.MinP != nil {
t.Fatalf("min_p = %v, want nil", *got.Options.MinP)
}
if got.Options.RepeatLastN != nil {
t.Fatalf("repeat_last_n = %v, want nil", *got.Options.RepeatLastN)
}
if got.Options.FrequencyPenalty != nil {
t.Fatalf("frequency_penalty = %v, want nil", *got.Options.FrequencyPenalty)
}
// NumPredict is not listed in ExplicitOptions above but is still expected.
if got.Options.NumPredict != opts.NumPredict {
t.Fatalf("num_predict = %d, want %d", got.Options.NumPredict, opts.NumPredict)
}
}
// roundTripFunc adapts an ordinary function to http.RoundTripper so tests can
// supply an in-process transport.
type roundTripFunc func(*http.Request) (*http.Response, error)

// RoundTrip implements http.RoundTripper by invoking f with the request and
// returning whatever f returns.
func (f roundTripFunc) RoundTrip(r *http.Request) (*http.Response, error) {
	resp, err := f(r)
	return resp, err
}

View File

@@ -8,14 +8,13 @@ import "C"
import (
"sync"
"sync/atomic"
"unsafe"
)
var (
gatedDeltaMetalKernelOnce sync.Once
gatedDeltaMetalKernel C.mlx_fast_metal_kernel
gatedDeltaMetalDisabled atomic.Bool
gatedDeltaMetalDisabled bool
)
const gatedDeltaMetalKernelSource = `
@@ -108,7 +107,7 @@ func cStringVector(values []string) (C.mlx_vector_string, func(), bool) {
func initGatedDeltaMetalKernel() {
inputs, freeInputs, ok := cStringVector([]string{"q", "k", "v", "g", "beta", "state_in", "T"})
if !ok {
gatedDeltaMetalDisabled.Store(true)
gatedDeltaMetalDisabled = true
freeInputs()
return
}
@@ -116,7 +115,7 @@ func initGatedDeltaMetalKernel() {
outputs, freeOutputs, ok := cStringVector([]string{"y", "state_out"})
if !ok {
gatedDeltaMetalDisabled.Store(true)
gatedDeltaMetalDisabled = true
freeOutputs()
return
}
@@ -143,7 +142,7 @@ func initGatedDeltaMetalKernel() {
// GatedDeltaKernel runs a fused Metal kernel for the qwen3.5 recurrent update.
// It returns ok=false on unsupported shapes/devices or kernel setup/apply failure.
func GatedDeltaKernel(q, k, v, g, beta, state *Array) (y, nextState *Array, ok bool) {
if gatedDeltaMetalDisabled.Load() {
if gatedDeltaMetalDisabled {
return nil, nil, false
}
if q == nil || k == nil || v == nil || g == nil || beta == nil || state == nil {
@@ -190,7 +189,7 @@ func GatedDeltaKernel(q, k, v, g, beta, state *Array) (y, nextState *Array, ok b
}
gatedDeltaMetalKernelOnce.Do(initGatedDeltaMetalKernel)
if gatedDeltaMetalDisabled.Load() {
if gatedDeltaMetalDisabled {
return nil, nil, false
}
@@ -200,7 +199,7 @@ func GatedDeltaKernel(q, k, v, g, beta, state *Array) (y, nextState *Array, ok b
cInT := C.CString("InT")
defer C.free(unsafe.Pointer(cInT))
if C.mlx_fast_metal_kernel_config_add_template_arg_dtype(cfg, cInT, C.mlx_dtype(dtype)) != 0 {
gatedDeltaMetalDisabled.Store(true)
gatedDeltaMetalDisabled = true
return nil, nil, false
}
for _, tpl := range []struct {
@@ -216,7 +215,7 @@ func GatedDeltaKernel(q, k, v, g, beta, state *Array) (y, nextState *Array, ok b
rc := C.mlx_fast_metal_kernel_config_add_template_arg_int(cfg, cn, C.int(tpl.value))
C.free(unsafe.Pointer(cn))
if rc != 0 {
gatedDeltaMetalDisabled.Store(true)
gatedDeltaMetalDisabled = true
return nil, nil, false
}
}
@@ -224,15 +223,15 @@ func GatedDeltaKernel(q, k, v, g, beta, state *Array) (y, nextState *Array, ok b
yShape := []C.int{C.int(B), C.int(T), C.int(Hv), C.int(Dv)}
stateShape := []C.int{C.int(B), C.int(Hv), C.int(Dv), C.int(Dk)}
if C.mlx_fast_metal_kernel_config_add_output_arg(cfg, unsafe.SliceData(yShape), C.size_t(len(yShape)), C.mlx_dtype(dtype)) != 0 {
gatedDeltaMetalDisabled.Store(true)
gatedDeltaMetalDisabled = true
return nil, nil, false
}
if C.mlx_fast_metal_kernel_config_add_output_arg(cfg, unsafe.SliceData(stateShape), C.size_t(len(stateShape)), C.mlx_dtype(dtype)) != 0 {
gatedDeltaMetalDisabled.Store(true)
gatedDeltaMetalDisabled = true
return nil, nil, false
}
if C.mlx_fast_metal_kernel_config_set_grid(cfg, 32, C.int(Dv), C.int(B*Hv)) != 0 {
gatedDeltaMetalDisabled.Store(true)
gatedDeltaMetalDisabled = true
return nil, nil, false
}
threadY := Dv
@@ -240,7 +239,7 @@ func GatedDeltaKernel(q, k, v, g, beta, state *Array) (y, nextState *Array, ok b
threadY = 4
}
if C.mlx_fast_metal_kernel_config_set_thread_group(cfg, 32, C.int(threadY), 1) != 0 {
gatedDeltaMetalDisabled.Store(true)
gatedDeltaMetalDisabled = true
return nil, nil, false
}
@@ -260,7 +259,7 @@ func GatedDeltaKernel(q, k, v, g, beta, state *Array) (y, nextState *Array, ok b
outVec := C.mlx_vector_array_new()
defer C.mlx_vector_array_free(outVec)
if C.mlx_fast_metal_kernel_apply(&outVec, gatedDeltaMetalKernel, inVec, cfg, DefaultStream().ctx) != 0 {
gatedDeltaMetalDisabled.Store(true)
gatedDeltaMetalDisabled = true
return nil, nil, false
}
if int(C.mlx_vector_array_size(outVec)) < 2 {
@@ -273,3 +272,101 @@ func GatedDeltaKernel(q, k, v, g, beta, state *Array) (y, nextState *Array, ok b
C.mlx_vector_array_get(&nextState.ctx, outVec, 1)
return y, nextState, true
}
// repeatHeadsForGatedDelta repeats x along its third dimension repeatFactor
// times. It inserts a new axis at position 3, tiles that axis by repeatFactor,
// and reshapes back to four dimensions with dims[2]*repeatFactor in the third
// position. x is returned unchanged when repeatFactor <= 1. The first four
// dimensions of x are read, so x is expected to be 4-D.
func repeatHeadsForGatedDelta(x *Array, repeatFactor int) *Array {
	if repeatFactor <= 1 {
		return x
	}

	// Capture the input dimensions before the array is expanded.
	dims := x.Dims()
	expanded := ExpandDims(x, 3)
	tiled := Tile(expanded, []int32{1, 1, 1, int32(repeatFactor), 1})

	d0, d1 := int32(dims[0]), int32(dims[1])
	d2 := int32(dims[2] * repeatFactor)
	d3 := int32(dims[3])
	return Reshape(tiled, d0, d1, d2, d3)
}
// gatedDeltaFallback evaluates the gated-delta recurrence with ordinary array
// ops instead of the fused kernel.
//
// Expected ranks and dimensions (validated below; any mismatch yields nil, nil):
//
//	q, k:    [B, T, Hk, Dk]
//	v:       [B, T, Hv, Dv]
//	g, beta: [B, T, Hv]
//	state:   [B, Hv, Dv, Dk]
//
// Hv must be a multiple of Hk; q and k are repeated along the head dimension
// to Hv heads before the recurrence. For every time step the state is updated
// as
//
//	state = state * g
//	kvMem = sum(state * k, last axis)
//	delta = (v - kvMem) * beta
//	state = state + k * delta
//	y_t   = sum(state * q, last axis)
//
// It returns the per-step outputs joined along axis 1 and the final state.
func gatedDeltaFallback(q, k, v, g, beta, state *Array) (y, nextState *Array) {
	inputs := [...]*Array{q, k, v, g, beta, state}
	for _, a := range inputs {
		if a == nil {
			return nil, nil
		}
	}
	for _, a := range inputs {
		if !a.Valid() {
			return nil, nil
		}
	}

	qd, kd, vd := q.Dims(), k.Dims(), v.Dims()
	gd, bd, sd := g.Dims(), beta.Dims(), state.Dims()
	ranksOK := len(qd) == 4 && len(kd) == 4 && len(vd) == 4 &&
		len(gd) == 3 && len(bd) == 3 && len(sd) == 4
	if !ranksOK {
		return nil, nil
	}

	B, T, Hk, Dk := int32(qd[0]), int32(qd[1]), int32(qd[2]), int32(qd[3])
	Hv, Dv := int32(vd[2]), int32(vd[3])
	// Hk is checked for positivity before it is used as a divisor.
	if T <= 0 || Hk <= 0 || Dk <= 0 || Hv <= 0 || Dv <= 0 || Hv%Hk != 0 {
		return nil, nil
	}

	// hasPrefix reports whether dims starts with the given expected sizes.
	hasPrefix := func(dims []int, want ...int32) bool {
		for i, w := range want {
			if dims[i] != int(w) {
				return false
			}
		}
		return true
	}
	switch {
	case !hasPrefix(kd, B, T, Hk, Dk),
		!hasPrefix(vd, B, T),
		!hasPrefix(gd, B, T, Hv),
		!hasPrefix(bd, B, T, Hv),
		!hasPrefix(sd, B, Hv, Dv, Dk):
		return nil, nil
	}

	repeatFactor := int(Hv / Hk)
	q = repeatHeadsForGatedDelta(q, repeatFactor)
	k = repeatHeadsForGatedDelta(k, repeatFactor)

	// advance applies one step of the recurrence to s and returns the new
	// state together with that step's output (time axis already removed).
	advance := func(s, qt, kt, vt, gt, bt *Array) (*Array, *Array) {
		s = Mul(s, ExpandDims(ExpandDims(gt, -1), -1))
		kvMem := Sum(Mul(s, ExpandDims(kt, 2)), -1, false)
		delta := Mul(Sub(vt, kvMem), ExpandDims(bt, -1))
		s = Add(s, Mul(ExpandDims(kt, 2), ExpandDims(delta, -1)))
		return s, Sum(Mul(s, ExpandDims(qt, 2)), -1, false)
	}

	nextState = state

	// Single-step input: drop the time axis directly, no slicing or concat.
	if T == 1 {
		qt := Squeeze(q, 1)
		kt := Squeeze(k, 1)
		vt := Squeeze(v, 1)
		gt := Squeeze(g, 1)
		bt := Squeeze(beta, 1)
		var yt *Array
		nextState, yt = advance(nextState, qt, kt, vt, gt, bt)
		return ExpandDims(yt, 1), nextState
	}

	// at extracts time step t from a (leading dims B, T, then tail) and
	// removes the time axis.
	at := func(a *Array, t int32, tail ...int32) *Array {
		start := make([]int32, len(tail)+2)
		start[1] = t
		stop := append([]int32{B, t + 1}, tail...)
		return Squeeze(SliceStartStop(a, start, stop), 1)
	}

	outs := make([]*Array, 0, T)
	for t := int32(0); t < T; t++ {
		qt := at(q, t, Hv, Dk)
		kt := at(k, t, Hv, Dk)
		vt := at(v, t, Hv, Dv)
		gt := at(g, t, Hv)
		bt := at(beta, t, Hv)
		var yt *Array
		nextState, yt = advance(nextState, qt, kt, vt, gt, bt)
		outs = append(outs, ExpandDims(yt, 1))
	}
	return Concatenate(outs, 1), nextState
}
// GatedDelta performs the recurrent gated-delta update.
//
// GatedDeltaKernel is tried first; when it reports ok=false the call is
// forwarded unchanged to gatedDeltaFallback, which takes the same inputs and
// returns the same pair of outputs.
func GatedDelta(q, k, v, g, beta, state *Array) (y, nextState *Array) {
	out, next, ok := GatedDeltaKernel(q, k, v, g, beta, state)
	if !ok {
		return gatedDeltaFallback(q, k, v, g, beta, state)
	}
	return out, next
}

View File

@@ -93,12 +93,6 @@ func (t *Array) Divide(other *Array) *Array {
return out
}
// Cumsum returns a new "CUMSUM" array filled by C.mlx_cumsum applied to t
// along axis, with the reverse and inclusive flags passed through, on the
// default stream.
func (t *Array) Cumsum(axis int, reverse, inclusive bool) *Array {
	result := New("CUMSUM")
	C.mlx_cumsum(&result.ctx, t.ctx, C.int(axis), C.bool(reverse), C.bool(inclusive), DefaultStream().ctx)
	return result
}
func (t *Array) ExpandDims(axis int) *Array {
out := New("EXPAND_DIMS")
C.mlx_expand_dims(&out.ctx, t.ctx, C.int(axis), DefaultStream().ctx)
@@ -129,30 +123,12 @@ func (t *Array) GatherMM(other, lhs, rhs *Array, sorted bool) *Array {
return out
}
// GreaterEqual returns a new "GREATER_EQUAL" array filled by
// C.mlx_greater_equal applied to t and other on the default stream.
func (t *Array) GreaterEqual(other *Array) *Array {
	result := New("GREATER_EQUAL")
	C.mlx_greater_equal(&result.ctx, t.ctx, other.ctx, DefaultStream().ctx)
	return result
}
// Logsumexp returns a new "LOGSUMEXP" array filled by C.mlx_logsumexp applied
// to t on the default stream; keepDims is passed through to the C call.
func (t *Array) Logsumexp(keepDims bool) *Array {
	result := New("LOGSUMEXP")
	C.mlx_logsumexp(&result.ctx, t.ctx, C.bool(keepDims), DefaultStream().ctx)
	return result
}
// Less returns a new "LESS" array filled by C.mlx_less applied to t and other
// on the default stream.
func (t *Array) Less(other *Array) *Array {
	result := New("LESS")
	C.mlx_less(&result.ctx, t.ctx, other.ctx, DefaultStream().ctx)
	return result
}
// LogicalOr returns a new "LOGICAL_OR" array filled by C.mlx_logical_or
// applied to t and other on the default stream.
func (t *Array) LogicalOr(other *Array) *Array {
	result := New("LOGICAL_OR")
	C.mlx_logical_or(&result.ctx, t.ctx, other.ctx, DefaultStream().ctx)
	return result
}
func (t *Array) Matmul(other *Array) *Array {
out := New("MATMUL")
C.mlx_matmul(&out.ctx, t.ctx, other.ctx, DefaultStream().ctx)

View File

@@ -429,27 +429,16 @@ func Collect(v any) []*Array {
return arrays
}
// Snapshot copies an array into a fresh leaf value with no Go-side graph inputs.
func Snapshot(a *Array) *Array {
// Copy copies an array into a fresh leaf value with no Go-side graph inputs.
func Copy(a *Array) *Array {
if a == nil || !a.Valid() {
return a
}
out := New("SNAPSHOT")
out := New("COPY")
C.mlx_copy(&out.ctx, a.ctx, DefaultStream().ctx)
return out
}
// Detach creates a fresh "DETACH" Array handle and points it at a's underlying
// value via C.mlx_array_set, so the new handle carries no Go-side graph input
// references. A nil or invalid a is returned as-is.
func Detach(a *Array) *Array {
	if a != nil && a.Valid() {
		handle := New("DETACH")
		C.mlx_array_set(&handle.ctx, a.ctx)
		return handle
	}
	return a
}
func collect(v reflect.Value, arrays *[]*Array, seen map[uintptr]bool) {
if !v.IsValid() {
return

View File

@@ -26,10 +26,6 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
}
ctx := request.Ctx
if ctx == nil {
ctx = context.Background()
}
var (
sample, logprobs *mlx.Array
nextSample, nextLogprobs *mlx.Array
@@ -82,7 +78,6 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
defer session.close()
caches := session.caches
tokens := session.remaining
history := append([]int32(nil), session.inputs...)
prefillChunk := prefillChunkSize()
materializeCaches := func() {
@@ -115,13 +110,13 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
mlx.ClearCache()
}
step := func(token *mlx.Array, history []int32) (*mlx.Array, *mlx.Array) {
step := func(token *mlx.Array) (*mlx.Array, *mlx.Array) {
fwd := r.Model.Forward(token.ExpandDims(0), caches)
logits := r.Model.Unembed(fwd)
logits = logits.Slice(mlx.Slice(), mlx.Slice(logits.Dim(1)-1), mlx.Slice()).Squeeze(1)
logprobs := logits.Subtract(logits.Logsumexp(true))
sample := request.Sample(logprobs, history)
sample := request.Sample(logprobs)
mlx.Pin(sample, logprobs)
mlx.Sweep()
@@ -130,7 +125,7 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
return sample, logprobs
}
sample, logprobs = step(mlx.FromValues(tokens[processed:], total-processed), history)
sample, logprobs = step(mlx.FromValues(tokens[processed:], total-processed))
var b bytes.Buffer
@@ -140,6 +135,8 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
return err
}
nextSample, nextLogprobs = step(sample)
if i == 0 {
mlx.Eval(sample)
final.PromptEvalDuration = time.Since(now)
@@ -148,7 +145,6 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
output := int32(sample.Int())
session.outputs = append(session.outputs, output)
history = append(history, output)
if r.Tokenizer.IsEOS(output) {
final.DoneReason = 0
@@ -157,15 +153,13 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
}
select {
case <-request.Ctx.Done():
return request.Ctx.Err()
case <-ctx.Done():
return ctx.Err()
case request.Responses <- CompletionResponse{
Content: r.Decode(output, &b),
}:
}
nextSample, nextLogprobs = step(sample, history)
mlx.Unpin(sample, logprobs)
sample, logprobs = nextSample, nextLogprobs
nextSample, nextLogprobs = nil, nil

View File

@@ -32,17 +32,12 @@ type Request struct {
type TextCompletionsRequest struct {
Prompt string `json:"prompt"`
Think *bool `json:"think,omitempty"`
Options struct {
Temperature *float32 `json:"temperature"`
TopP *float32 `json:"top_p"`
MinP *float32 `json:"min_p"`
TopK *int `json:"top_k"`
RepeatLastN *int `json:"repeat_last_n"`
RepeatPenalty *float32 `json:"repeat_penalty"`
PresencePenalty *float32 `json:"presence_penalty"`
FrequencyPenalty *float32 `json:"frequency_penalty"`
MaxTokens int `json:"max_tokens"`
Temperature float32 `json:"temperature"`
TopP float32 `json:"top_p"`
MinP float32 `json:"min_p"`
TopK int `json:"top_k"`
MaxTokens int `json:"max_tokens"`
// Deprecated: use MaxTokens instead
NumPredict int `json:"num_predict"`

View File

@@ -9,204 +9,69 @@ import (
)
type Sampler interface {
Sample(*mlx.Array, []int32) *mlx.Array
Sample(*mlx.Array) *mlx.Array
}
func New(temp, top_p, min_p float32, top_k, repeatLastN int, repeatPenalty, presencePenalty, frequencyPenalty float32) Sampler {
var samplers []Sampler
if repeatLastN > 0 && (repeatPenalty != 1 || presencePenalty != 0 || frequencyPenalty != 0) {
samplers = append(samplers, Penalty{
RepeatLastN: repeatLastN,
RepeatPenalty: repeatPenalty,
PresencePenalty: presencePenalty,
FrequencyPenalty: frequencyPenalty,
})
func New(temp, top_p, min_p float32, top_k int) Sampler {
if temp == 0 {
return greedy{}
}
if temp == 0 {
samplers = append(samplers, greedy{})
} else {
samplers = append(samplers, Distribution{
Temperature: temp,
TopK: top_k,
TopP: top_p,
MinP: min_p,
})
var samplers []Sampler
if top_p > 0 && top_p < 1 {
samplers = append(samplers, TopP(top_p))
}
if min_p != 0 {
samplers = append(samplers, MinP(min_p))
}
if top_k > 0 {
samplers = append(samplers, TopK(top_k))
}
samplers = append(samplers, Temperature(temp))
return chain(samplers)
}
type greedy struct{}
func (greedy) Sample(logits *mlx.Array, _ []int32) *mlx.Array {
func (greedy) Sample(logits *mlx.Array) *mlx.Array {
return logits.Argmax(-1, false)
}
type chain []Sampler
func (c chain) Sample(logits *mlx.Array, history []int32) *mlx.Array {
func (c chain) Sample(logits *mlx.Array) *mlx.Array {
for _, sampler := range c {
logits = sampler.Sample(logits, history)
logits = sampler.Sample(logits)
}
return logits
}
type Distribution struct {
Temperature float32
TopK int
TopP float32
MinP float32
type Temperature float32
func (t Temperature) Sample(logits *mlx.Array) *mlx.Array {
return mlx.DivScalar(logits, float32(t)).Categorical(-1)
}
func (d Distribution) Sample(logits *mlx.Array, _ []int32) *mlx.Array {
filtered, indices := d.filter(logits)
sample := filtered.Categorical(-1)
if indices == nil {
return sample
}
type TopP float32
positions := sample.ExpandDims(1)
return indices.TakeAlongAxis(positions, -1).Squeeze(1)
func (p TopP) Sample(logprobs *mlx.Array) *mlx.Array {
// TODO: implement
return logprobs
}
func (d Distribution) filter(logits *mlx.Array) (*mlx.Array, *mlx.Array) {
candidates := logits
var candidateIndices *mlx.Array
type MinP float32
if d.TopK > 0 && d.TopK < logits.Dim(logits.NumDims()-1) {
partitions := logits.Negative().ArgpartitionAxis(d.TopK-1, -1)
switch logits.NumDims() {
case 1:
candidateIndices = partitions.Slice(mlx.Slice(0, d.TopK))
default:
candidateIndices = partitions.Slice(mlx.Slice(), mlx.Slice(0, d.TopK))
}
candidates = logits.TakeAlongAxis(candidateIndices, -1)
}
if d.Temperature != 1 {
candidates = mlx.DivScalar(candidates, d.Temperature)
}
if !d.needsProbabilityFilters() {
return candidates, candidateIndices
}
order := candidates.Negative().ArgsortAxis(-1)
sortedLogits := candidates.TakeAlongAxis(order, -1)
sortedProbs := mlx.SoftmaxAxis(candidates, -1, true).TakeAlongAxis(order, -1)
remove := d.topPRemovalMask(sortedProbs)
if d.MinP > 0 {
minPRemove := d.minPRemovalMask(sortedProbs)
if remove == nil {
remove = minPRemove
} else {
remove = remove.LogicalOr(minPRemove)
}
}
if remove == nil {
return candidates, candidateIndices
}
negInf := mlx.FromValue(float32(math.Inf(-1)))
filtered := mlx.Where(remove, negInf, sortedLogits)
return candidates.PutAlongAxis(order, filtered, -1), candidateIndices
func (p MinP) Sample(logprobs *mlx.Array) *mlx.Array {
// TODO: implement
return logprobs
}
func (d Distribution) needsProbabilityFilters() bool {
return (d.TopP > 0 && d.TopP < 1) || d.MinP > 0
}
func (d Distribution) topPRemovalMask(sortedProbs *mlx.Array) *mlx.Array {
if d.TopP <= 0 || d.TopP >= 1 {
return nil
}
threshold := mlx.NewScalarArray(d.TopP)
prevCum := sortedProbs.Cumsum(-1, false, true).Subtract(sortedProbs)
return prevCum.GreaterEqual(threshold)
}
func (d Distribution) minPRemovalMask(sortedProbs *mlx.Array) *mlx.Array {
if d.MinP <= 0 {
return nil
}
var maxProb *mlx.Array
switch sortedProbs.NumDims() {
case 1:
maxProb = sortedProbs.Slice(mlx.Slice(0, 1))
default:
maxProb = sortedProbs.Slice(mlx.Slice(), mlx.Slice(0, 1))
}
threshold := mlx.MulScalar(maxProb, d.MinP)
return sortedProbs.Less(threshold)
}
type Penalty struct {
RepeatLastN int
RepeatPenalty float32
PresencePenalty float32
FrequencyPenalty float32
}
func (p Penalty) Sample(logprobs *mlx.Array, history []int32) *mlx.Array {
if len(history) == 0 {
return logprobs
}
window := p.RepeatLastN
if window <= 0 || window > len(history) {
window = len(history)
}
counts := make(map[int32]int, window)
order := make([]int32, 0, window)
for _, token := range history[len(history)-window:] {
if token < 0 {
continue
}
if counts[token] == 0 {
order = append(order, token)
}
counts[token]++
}
if len(order) == 0 {
return logprobs
}
indexShape := []int32{int32(len(order))}
valueShape := []int{len(order)}
if logprobs.NumDims() > 1 {
indexShape = []int32{1, int32(len(order))}
valueShape = []int{1, len(order)}
}
indices := mlx.NewArrayInt32(order, indexShape)
selected := logprobs.TakeAlongAxis(indices, -1)
mlx.Eval(selected)
values := selected.Floats()
for i, token := range order {
v := values[i]
if p.RepeatPenalty != 1 {
if v < 0 {
v *= p.RepeatPenalty
} else {
v /= p.RepeatPenalty
}
}
if p.PresencePenalty != 0 {
v -= p.PresencePenalty
}
if p.FrequencyPenalty != 0 {
v -= p.FrequencyPenalty * float32(counts[token])
}
values[i] = v
}
return logprobs.PutAlongAxis(indices, mlx.FromValues(values, valueShape...), -1)
// TopK is a sampler stage that keeps the k highest-scoring entries.
type TopK int

// Sample partitions the negated logprobs around position k-1 along the last
// axis and writes -Inf into logprobs at the indices taken from position k
// onward in that partition.
// NOTE(review): presumably mlx.Slice(k, 0) means "from k to the end" - confirm.
func (k TopK) Sample(logprobs *mlx.Array) *mlx.Array {
	keep := int(k)
	partitioned := logprobs.Negative().ArgpartitionAxis(keep-1, -1)
	dropped := partitioned.Slice(mlx.Slice(), mlx.Slice(keep, 0))
	negInf := mlx.FromValue(float32(math.Inf(-1)))
	return logprobs.PutAlongAxis(dropped, negInf, -1)
}

View File

@@ -1,104 +0,0 @@
//go:build mlx
package sample
import (
"math"
"testing"
"github.com/ollama/ollama/x/mlxrunner/mlx"
)
// TestPenaltySample checks that repeat, presence and frequency penalties are
// combined per token: token 2 occurs twice and token 1 once in the history,
// while tokens 0 and 3 are untouched.
func TestPenaltySample(t *testing.T) {
	if err := mlx.CheckInit(); err != nil {
		t.Skipf("MLX not available: %v", err)
	}

	input := mlx.FromValues([]float32{1.0, -2.0, 3.0, 4.0}, 1, 4)
	penalty := Penalty{
		RepeatLastN:      3,
		RepeatPenalty:    2.0,
		PresencePenalty:  1.5,
		FrequencyPenalty: 0.25,
	}

	got := penalty.Sample(input, []int32{2, 1, 2})
	mlx.Eval(got)
	values := got.Floats()

	want := []float32{1.0, -5.75, -0.5, 4.0}
	if len(values) != len(want) {
		t.Fatalf("len(values) = %d, want %d", len(values), len(want))
	}
	for i, expected := range want {
		if diff := math.Abs(float64(values[i] - expected)); diff > 1e-5 {
			t.Fatalf("values[%d] = %v, want %v", i, values[i], want[i])
		}
	}
}
// TestPenaltySampleHonorsRepeatWindow checks that only the last RepeatLastN
// history tokens are penalized: with a window of 1 over history {0, 1}, only
// token 1 receives the presence penalty.
func TestPenaltySampleHonorsRepeatWindow(t *testing.T) {
	if err := mlx.CheckInit(); err != nil {
		t.Skipf("MLX not available: %v", err)
	}

	logprobs := mlx.FromValues([]float32{
		1.0, 2.0, 3.0,
	}, 1, 3)

	got := Penalty{
		RepeatLastN: 1,
		// Set the neutral repeat penalty explicitly so the test isolates the
		// window behaviour instead of depending on how a zero factor is handled.
		RepeatPenalty:   1.0,
		PresencePenalty: 1.0,
	}.Sample(logprobs, []int32{0, 1})
	mlx.Eval(got)

	want := []float32{1.0, 1.0, 3.0}
	values := got.Floats()
	// Fail with a clear message rather than an index-out-of-range panic.
	if len(values) != len(want) {
		t.Fatalf("len(values) = %d, want %d", len(values), len(want))
	}
	for i := range want {
		if math.Abs(float64(values[i]-want[i])) > 1e-5 {
			t.Fatalf("values[%d] = %v, want %v", i, values[i], want[i])
		}
	}
}
// TestDistributionFilterTopP checks that top-k followed by top-p leaves only
// the highest logit: every other position must be -Inf once the filtered
// candidates are scattered back to full width.
func TestDistributionFilterTopP(t *testing.T) {
	if err := mlx.CheckInit(); err != nil {
		t.Skipf("MLX not available: %v", err)
	}

	logits := mlx.FromValues([]float32{10.0, 9.0, 1.0, 0.0}, 1, 4)
	dist := Distribution{
		Temperature: 1.0,
		TopK:        2,
		TopP:        0.55,
	}

	filtered, indices := dist.filter(logits)
	got := materializeFilteredLogits(filtered, indices, 4)
	mlx.Eval(got)
	values := got.Floats()

	if values[0] != 10.0 {
		t.Fatalf("values[0] = %v, want 10", values[0])
	}
	for i, v := range values {
		if i == 0 {
			continue
		}
		if !math.IsInf(float64(v), -1) {
			t.Fatalf("values[%d] = %v, want -Inf", i, values[i])
		}
	}
}
// materializeFilteredLogits expands filtered candidate logits back to a
// 1 x width row. With nil indices the filtered array is returned as is;
// otherwise the values are scattered along the last axis into a row that is
// -Inf everywhere else.
func materializeFilteredLogits(filtered, indices *mlx.Array, width int) *mlx.Array {
	if indices == nil {
		return filtered
	}

	negInf := float32(math.Inf(-1))
	zeros := mlx.Zeros(mlx.DTypeFloat32, 1, width)
	return mlx.AddScalar(zeros, negInf).PutAlongAxis(indices, filtered, -1)
}

View File

@@ -16,89 +16,12 @@ import (
"strconv"
"time"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/logutil"
"github.com/ollama/ollama/x/mlxrunner/mlx"
"github.com/ollama/ollama/x/mlxrunner/model/base"
"github.com/ollama/ollama/x/mlxrunner/sample"
"github.com/ollama/ollama/x/models/qwen3_5"
)
// samplingConfig is the fully resolved set of sampling parameters for one
// request: model-specific defaults with any request-supplied values applied
// on top (see resolveSamplingConfig).
type samplingConfig struct {
temperature float32
topP float32
minP float32
topK int
repeatLastN int
repeatPenalty float32
presencePenalty float32
frequencyPenalty float32
}
// defaultSamplingConfig returns the sampling defaults for model m.
//
// Models other than *qwen3_5.Model use the values from api.DefaultOptions.
// Qwen3.5 uses its own profile, which lowers temperature and top-p when think
// is explicitly set to false; a nil or true think keeps the thinking profile.
func defaultSamplingConfig(m base.Model, think *bool) samplingConfig {
	if _, isQwen := m.(*qwen3_5.Model); !isQwen {
		defaults := api.DefaultOptions()
		return samplingConfig{
			temperature:      defaults.Temperature,
			topP:             defaults.TopP,
			minP:             defaults.MinP,
			topK:             defaults.TopK,
			repeatLastN:      defaults.RepeatLastN,
			repeatPenalty:    defaults.RepeatPenalty,
			presencePenalty:  defaults.PresencePenalty,
			frequencyPenalty: defaults.FrequencyPenalty,
		}
	}

	// Thinking profile; also used when think is unset.
	cfg := samplingConfig{
		temperature:      1.0,
		topP:             0.95,
		minP:             0.0,
		topK:             20,
		repeatLastN:      64,
		repeatPenalty:    1.0,
		presencePenalty:  1.5,
		frequencyPenalty: 0.0,
	}

	thinkingDisabled := think != nil && !*think
	if thinkingDisabled {
		cfg.temperature = 0.7
		cfg.topP = 0.8
	}
	return cfg
}
// resolveSamplingConfig starts from the model defaults (taking req.Think into
// account) and overrides each field for which the request carries a non-nil
// option value.
func resolveSamplingConfig(m base.Model, req Request) samplingConfig {
	resolved := defaultSamplingConfig(m, req.Think)

	if v := req.Options.Temperature; v != nil {
		resolved.temperature = *v
	}
	if v := req.Options.TopP; v != nil {
		resolved.topP = *v
	}
	if v := req.Options.MinP; v != nil {
		resolved.minP = *v
	}
	if v := req.Options.TopK; v != nil {
		resolved.topK = *v
	}
	if v := req.Options.RepeatLastN; v != nil {
		resolved.repeatLastN = *v
	}
	if v := req.Options.RepeatPenalty; v != nil {
		resolved.repeatPenalty = *v
	}
	if v := req.Options.PresencePenalty; v != nil {
		resolved.presencePenalty = *v
	}
	if v := req.Options.FrequencyPenalty; v != nil {
		resolved.frequencyPenalty = *v
	}

	return resolved
}
func Execute(args []string) error {
slog.SetDefault(logutil.NewLogger(os.Stderr, envconfig.LogLevel()))
@@ -167,18 +90,12 @@ func Execute(args []string) error {
request.Options.MaxTokens = cmp.Or(request.Options.MaxTokens, request.Options.NumPredict)
sampling := resolveSamplingConfig(runner.Model, request)
request.Pipeline = runner.TextGenerationPipeline
request.Sampler = sample.New(
sampling.temperature,
sampling.topP,
sampling.minP,
sampling.topK,
sampling.repeatLastN,
sampling.repeatPenalty,
sampling.presencePenalty,
sampling.frequencyPenalty,
request.Options.Temperature,
request.Options.TopP,
request.Options.MinP,
request.Options.TopK,
)
var cancel context.CancelFunc

View File

@@ -1,172 +0,0 @@
//go:build mlx
package mlxrunner
import (
"testing"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/x/mlxrunner/cache"
"github.com/ollama/ollama/x/mlxrunner/mlx"
"github.com/ollama/ollama/x/mlxrunner/model/base"
"github.com/ollama/ollama/x/models/qwen3_5"
"github.com/ollama/ollama/x/tokenizer"
)
// stubModel is a do-nothing model used to exercise the code paths for models
// that are not Qwen3.5. Every method returns its zero value.
type stubModel struct{}

// Forward returns nil.
func (stubModel) Forward(*mlx.Array, []cache.Cache) *mlx.Array {
	return nil
}

// Unembed returns nil.
func (stubModel) Unembed(*mlx.Array) *mlx.Array {
	return nil
}

// NumLayers returns 0.
func (stubModel) NumLayers() int {
	return 0
}

// Tokenizer returns nil.
func (stubModel) Tokenizer() *tokenizer.Tokenizer {
	return nil
}

// LoadWeights accepts any weights and returns nil.
func (stubModel) LoadWeights(map[string]*mlx.Array) error {
	return nil
}
func TestResolveSamplingConfigDefaults(t *testing.T) {
trueValue := true
falseValue := false
tests := []struct {
name string
model base.Model
req Request
want samplingConfig
}{
{
name: "generic model uses api defaults",
model: stubModel{},
req: Request{},
want: samplingConfig{
temperature: 0.8,
topP: 0.9,
minP: 0.0,
topK: 40,
repeatLastN: 64,
repeatPenalty: 1.1,
presencePenalty: 0.0,
frequencyPenalty: 0.0,
},
},
{
name: "qwen3.5 defaults to thinking profile when think unset",
model: &qwen3_5.Model{},
req: Request{},
want: samplingConfig{
temperature: 1.0,
topP: 0.95,
minP: 0.0,
topK: 20,
repeatLastN: 64,
repeatPenalty: 1.0,
presencePenalty: 1.5,
frequencyPenalty: 0.0,
},
},
{
name: "qwen3.5 thinking disabled defaults",
model: &qwen3_5.Model{},
req: Request{TextCompletionsRequest: TextCompletionsRequest{Think: &falseValue}},
want: samplingConfig{
temperature: 0.7,
topP: 0.8,
minP: 0.0,
topK: 20,
repeatLastN: 64,
repeatPenalty: 1.0,
presencePenalty: 1.5,
frequencyPenalty: 0.0,
},
},
{
name: "qwen3.5 thinking enabled defaults",
model: &qwen3_5.Model{},
req: Request{TextCompletionsRequest: TextCompletionsRequest{Think: &trueValue}},
want: samplingConfig{
temperature: 1.0,
topP: 0.95,
minP: 0.0,
topK: 20,
repeatLastN: 64,
repeatPenalty: 1.0,
presencePenalty: 1.5,
frequencyPenalty: 0.0,
},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
if got := resolveSamplingConfig(tt.model, tt.req); got != tt.want {
t.Fatalf("resolveSamplingConfig() = %+v, want %+v", got, tt.want)
}
})
}
}
func TestResolveSamplingConfigOverridesSpecifiedValues(t *testing.T) {
trueValue := true
temperature := float32(0.4)
topP := float32(0.6)
minP := float32(0.05)
topK := 12
repeatLastN := 32
repeatPenalty := float32(1.1)
presencePenalty := float32(0.7)
frequencyPenalty := float32(0.2)
got := resolveSamplingConfig(stubModel{}, Request{
TextCompletionsRequest: TextCompletionsRequest{
Think: &trueValue,
Options: struct {
Temperature *float32 `json:"temperature"`
TopP *float32 `json:"top_p"`
MinP *float32 `json:"min_p"`
TopK *int `json:"top_k"`
RepeatLastN *int `json:"repeat_last_n"`
RepeatPenalty *float32 `json:"repeat_penalty"`
PresencePenalty *float32 `json:"presence_penalty"`
FrequencyPenalty *float32 `json:"frequency_penalty"`
MaxTokens int `json:"max_tokens"`
NumPredict int `json:"num_predict"`
}{
Temperature: &temperature,
TopP: &topP,
MinP: &minP,
TopK: &topK,
RepeatLastN: &repeatLastN,
RepeatPenalty: &repeatPenalty,
PresencePenalty: &presencePenalty,
FrequencyPenalty: &frequencyPenalty,
},
},
})
want := samplingConfig{
temperature: temperature,
topP: topP,
minP: minP,
topK: topK,
repeatLastN: repeatLastN,
repeatPenalty: repeatPenalty,
presencePenalty: presencePenalty,
frequencyPenalty: frequencyPenalty,
}
if got != want {
t.Fatalf("resolveSamplingConfig() = %+v, want %+v", got, want)
}
}
// TestResolveSamplingConfigMatchesGenericDefaults verifies that the defaults
// for a non-Qwen model mirror api.DefaultOptions field for field.
func TestResolveSamplingConfigMatchesGenericDefaults(t *testing.T) {
	want := api.DefaultOptions()
	got := defaultSamplingConfig(stubModel{}, nil)

	expected := samplingConfig{
		temperature:      want.Temperature,
		topP:             want.TopP,
		minP:             want.MinP,
		topK:             want.TopK,
		repeatLastN:      want.RepeatLastN,
		repeatPenalty:    want.RepeatPenalty,
		presencePenalty:  want.PresencePenalty,
		frequencyPenalty: want.FrequencyPenalty,
	}
	if got != expected {
		t.Fatalf("defaultSamplingConfig() = %+v, want api defaults %+v", got, want)
	}
}

View File

@@ -437,15 +437,15 @@ func freeTensorKeys(tensors map[string]*mlx.Array, keys ...string) {
}
}
func stackAndDetach(parts []*mlx.Array) *mlx.Array {
func stackAndClone(parts []*mlx.Array) *mlx.Array {
if len(parts) == 0 {
return nil
}
stacked := mlx.Stack(parts, 0)
detached := mlx.Detach(stacked)
mlx.Eval(detached)
cloned := stacked.Clone()
mlx.Eval(cloned)
mlx.Unpin(stacked)
return detached
return cloned
}
func transposeExpertWeightForGatherMM(w *mlx.Array) *mlx.Array {
@@ -453,10 +453,10 @@ func transposeExpertWeightForGatherMM(w *mlx.Array) *mlx.Array {
return w
}
t := mlx.Transpose(w, 0, 2, 1)
d := mlx.Detach(t)
mlx.Eval(d)
cloned := t.Clone()
mlx.Eval(cloned)
mlx.Unpin(t)
return d
return cloned
}
func describeMoEProjection(prefix string, w *stackedExpertWeights) string {
@@ -612,12 +612,12 @@ func collectPerExpertProjection(tensors map[string]*mlx.Array, cfg *Config, useQ
return nil
}
out := &stackedExpertWeights{Weight: stackAndDetach(weights), Bits: bits, GroupSize: groupSize, Mode: mode}
out := &stackedExpertWeights{Weight: stackAndClone(weights), Bits: bits, GroupSize: groupSize, Mode: mode}
if len(scales) == len(weights) {
out.Scales = stackAndDetach(scales)
out.Scales = stackAndClone(scales)
}
if len(biases) == len(weights) {
out.Biases = stackAndDetach(biases)
out.Biases = stackAndClone(biases)
}
freeTensorKeys(tensors, consumedKeys...)
return out
@@ -1073,16 +1073,6 @@ func softplus(x *mlx.Array) *mlx.Array {
return mlx.Log(mlx.AddScalar(mlx.Exp(x), 1.0))
}
// repeatHeads repeats each entry along axis 2 of x repeatFactor times. It
// inserts a new axis at position 3, tiles that axis repeatFactor times, and
// reshapes so that dimension 2 becomes repeatFactor times larger.
// A repeatFactor of 1 or less returns x unchanged.
// NOTE(review): assumes x is 4-D, presumably (batch, seq, heads, dim) - confirm.
func repeatHeads(x *mlx.Array, repeatFactor int32) *mlx.Array {
	if repeatFactor <= 1 {
		return x
	}

	dims := x.Dims()
	expanded := mlx.ExpandDims(x, 3)
	tiled := mlx.Tile(expanded, []int32{1, 1, 1, repeatFactor, 1})

	batch, seq := int32(dims[0]), int32(dims[1])
	heads, width := int32(dims[2])*repeatFactor, int32(dims[3])
	return mlx.Reshape(tiled, batch, seq, heads, width)
}
func depthwiseCausalConv1d(x, w *mlx.Array, outLen int32) *mlx.Array {
if x == nil || w == nil {
return nil
@@ -1235,7 +1225,6 @@ func (g *GatedDeltaNet) Forward(x *mlx.Array, c cache.Cache, B, L int32, cfg *Co
k = mlx.Reshape(k, B, L, cfg.LinearNumKeyHeads, cfg.LinearKeyHeadDim)
v = mlx.Reshape(v, B, L, cfg.LinearNumValueHeads, cfg.LinearValueHeadDim)
invScale := float32(1.0 / math.Sqrt(float64(cfg.LinearKeyHeadDim)))
repeatFactor := cfg.LinearNumValueHeads / cfg.LinearNumKeyHeads
q = mlx.MulScalar(mlx.RMSNormFn(q, nil, 1e-6), invScale*invScale)
k = mlx.MulScalar(mlx.RMSNormFn(k, nil, 1e-6), invScale)
@@ -1256,50 +1245,7 @@ func (g *GatedDeltaNet) Forward(x *mlx.Array, c cache.Cache, B, L int32, cfg *Co
state = mlx.Zeros(x.DType(), int(B), int(cfg.LinearNumValueHeads), int(cfg.LinearValueHeadDim), int(cfg.LinearKeyHeadDim))
}
var out *mlx.Array
if fusedOut, fusedState, fused := mlx.GatedDeltaKernel(q, k, v, gDecay, beta, state); fused {
out = fusedOut
state = fusedState
} else if L == 1 {
if repeatFactor > 1 {
q = repeatHeads(q, repeatFactor)
k = repeatHeads(k, repeatFactor)
}
// Fast decode path: avoid per-token slice/append graph construction.
qt := mlx.Squeeze(q, 1)
kt := mlx.Squeeze(k, 1)
vt := mlx.Squeeze(v, 1)
gt := mlx.Squeeze(gDecay, 1)
bt := mlx.Squeeze(beta, 1)
state = mlx.Mul(state, mlx.ExpandDims(mlx.ExpandDims(gt, -1), -1))
kvMem := mlx.Sum(mlx.Mul(state, mlx.ExpandDims(kt, 2)), -1, false)
delta := mlx.Mul(mlx.Sub(vt, kvMem), mlx.ExpandDims(bt, -1))
state = mlx.Add(state, mlx.Mul(mlx.ExpandDims(kt, 2), mlx.ExpandDims(delta, -1)))
yt := mlx.Sum(mlx.Mul(state, mlx.ExpandDims(qt, 2)), -1, false)
out = mlx.ExpandDims(yt, 1)
} else {
if repeatFactor > 1 {
q = repeatHeads(q, repeatFactor)
k = repeatHeads(k, repeatFactor)
}
outs := make([]*mlx.Array, 0, L)
for t := int32(0); t < L; t++ {
qt := mlx.Squeeze(mlx.SliceStartStop(q, []int32{0, t, 0, 0}, []int32{B, t + 1, cfg.LinearNumValueHeads, cfg.LinearKeyHeadDim}), 1)
kt := mlx.Squeeze(mlx.SliceStartStop(k, []int32{0, t, 0, 0}, []int32{B, t + 1, cfg.LinearNumValueHeads, cfg.LinearKeyHeadDim}), 1)
vt := mlx.Squeeze(mlx.SliceStartStop(v, []int32{0, t, 0, 0}, []int32{B, t + 1, cfg.LinearNumValueHeads, cfg.LinearValueHeadDim}), 1)
gt := mlx.Squeeze(mlx.SliceStartStop(gDecay, []int32{0, t, 0}, []int32{B, t + 1, cfg.LinearNumValueHeads}), 1)
bt := mlx.Squeeze(mlx.SliceStartStop(beta, []int32{0, t, 0}, []int32{B, t + 1, cfg.LinearNumValueHeads}), 1)
state = mlx.Mul(state, mlx.ExpandDims(mlx.ExpandDims(gt, -1), -1))
kvMem := mlx.Sum(mlx.Mul(state, mlx.ExpandDims(kt, 2)), -1, false)
delta := mlx.Mul(mlx.Sub(vt, kvMem), mlx.ExpandDims(bt, -1))
state = mlx.Add(state, mlx.Mul(mlx.ExpandDims(kt, 2), mlx.ExpandDims(delta, -1)))
yt := mlx.Sum(mlx.Mul(state, mlx.ExpandDims(qt, 2)), -1, false)
outs = append(outs, mlx.ExpandDims(yt, 1))
}
out = mlx.Concatenate(outs, 1)
}
out, state := mlx.GatedDelta(q, k, v, gDecay, beta, state)
out = mlx.RMSNormFn(out, g.NormWeight, cfg.RMSNormEps)
out = mlx.Mul(out, mlx.SiLU(z))
out = mlx.Reshape(out, B, L, valueDim)
@@ -1432,6 +1378,10 @@ func (m *Model) NumLayers() int {
return len(m.Layers)
}
// MaxContextLength returns the model's MaxPositionEmbeddings value as an int.
func (m *Model) MaxContextLength() int {
return int(m.MaxPositionEmbeddings)
}
// Tokenizer returns the tokenizer stored on the model.
func (m *Model) Tokenizer() *tokenizer.Tokenizer {
return m.tok
}
@@ -1449,9 +1399,3 @@ func (m *Model) NewCaches() []cache.Cache {
}
return caches
}
// DisablePromptCache always returns false, so append-only prompt cache reuse
// stays enabled for this model.
// Recurrent caches report CanTrim=false, so divergent prefixes are dropped.
func (m *Model) DisablePromptCache() bool {
return false
}

View File

@@ -126,13 +126,6 @@ func TestResolveTensorPathLayout(t *testing.T) {
}
}
// TestModelRuntimeDefaults verifies that a zero-value Model leaves the prompt
// cache enabled.
func TestModelRuntimeDefaults(t *testing.T) {
	model := &Model{}
	if disabled := model.DisablePromptCache(); disabled {
		t.Fatal("DisablePromptCache() = true, want false")
	}
}
func TestNewCachesLayout(t *testing.T) {
m := &Model{
Config: &Config{