Add qwen3.5-next-moe support to MLX runner and models

This change: * adds support for qwen3.5-next-moe models (qwen3-next/qwen3.5-next/qwen3-coder) to the MLX runner * introduces recurrent cache support and related MLX ops * updates pipeline/runner integration and adds tests
consolidate the tokenizer (#14327 )
2026-02-23 18:46:44 -05:00 · 2026-02-20 17:25:23 -08:00 · 2026-02-19 15:55:45 -08:00 · 2026-02-19 10:55:02 -08:00
29 changed files with 3785 additions and 33 deletions
--- a/x/mlxrunner/cache/cache.go
+++ b/x/mlxrunner/cache/cache.go
@@ -4,13 +4,19 @@ package cache

 import (
 	"log/slog"
+	"os"

 	"github.com/ollama/ollama/x/mlxrunner/mlx"
 )

+func kvCacheGrowDebugEnabled() bool {
+	return os.Getenv("OLLAMA_MLX_DEBUG_CACHE_GROW") != ""
+}
+
 type Cache interface {
 	Update(keys, values *mlx.Array) (newKeys, newValues *mlx.Array)
 	State() (keys, values *mlx.Array)
+	Materialize() []*mlx.Array
 	Trim(int) int
 	Clone() Cache
 	Offset() int
@@ -48,6 +54,9 @@ func (c *KVCache) Update(keys, values *mlx.Array) (*mlx.Array, *mlx.Array) {
 		} else {
 			c.keys, c.values = newKeys, newValues
 		}
+		if kvCacheGrowDebugEnabled() {
+			slog.Info("KVCache grow", "prev", prev, "new_capacity", c.keys.Dim(2), "step", c.step)
+		}
 	}

 	c.offset += L
@@ -66,6 +75,17 @@ func (c *KVCache) State() (*mlx.Array, *mlx.Array) {
 		c.values.Slice(mlx.Slice(), mlx.Slice(), mlx.Slice(0, c.offset), mlx.Slice())
 }

+func (c *KVCache) Materialize() []*mlx.Array {
+	out := make([]*mlx.Array, 0, 2)
+	if c.keys != nil && c.keys.Valid() {
+		out = append(out, c.keys)
+	}
+	if c.values != nil && c.values.Valid() {
+		out = append(out, c.values)
+	}
+	return out
+}
+
 func (c *KVCache) Trim(n int) int {
 	n = min(c.offset, n)
 	c.offset -= n
--- a/x/mlxrunner/cache/cache_test.go
+++ b/x/mlxrunner/cache/cache_test.go
@@ -0,0 +1,17 @@
+//go:build mlx
+
+package cache
+
+import "testing"
+
+func TestKVCacheGrowDebugEnabled(t *testing.T) {
+	t.Setenv("OLLAMA_MLX_DEBUG_CACHE_GROW", "")
+	if kvCacheGrowDebugEnabled() {
+		t.Fatal("kvCacheGrowDebugEnabled() = true, want false")
+	}
+
+	t.Setenv("OLLAMA_MLX_DEBUG_CACHE_GROW", "1")
+	if !kvCacheGrowDebugEnabled() {
+		t.Fatal("kvCacheGrowDebugEnabled() = false, want true")
+	}
+}
--- a/x/mlxrunner/cache/recurrent.go
+++ b/x/mlxrunner/cache/recurrent.go
@@ -0,0 +1,162 @@
+//go:build mlx
+
+package cache
+
+import "github.com/ollama/ollama/x/mlxrunner/mlx"
+
+// RecurrentCache stores state for linear-recurrent layers.
+//
+// Conv state shape: [B, convTail, convDim]
+// Delta state shape: [B, numVHeads, headVDim, headKDim]
+type RecurrentCache struct {
+	convState  *mlx.Array
+	deltaState *mlx.Array
+	offset     int
+
+	convTail  int
+	convDim   int
+	numVHeads int
+	headVDim  int
+	headKDim  int
+}
+
+func (c *RecurrentCache) setStateMaterialized(dst **mlx.Array, v *mlx.Array) {
+	if v == nil || !v.Valid() {
+		return
+	}
+	if *dst == v {
+		return
+	}
+
+	// Break dependency chains so recurrent state does not retain the full
+	// per-token compute graph over time.
+	snap := mlx.Snapshot(v)
+	mlx.Eval(snap)
+
+	old := *dst
+	*dst = snap
+
+	// Release previous cached state root, then recursively free the transient
+	// incoming graph root now that a detached snapshot is retained in cache.
+	if old != nil && old != snap {
+		mlx.Release(old)
+	}
+	if v != snap && v != old {
+		mlx.Free(v)
+	}
+}
+
+func (c *RecurrentCache) setStateRaw(dst **mlx.Array, v *mlx.Array) {
+	if v == nil || !v.Valid() {
+		return
+	}
+	old := *dst
+	*dst = v
+	if old != nil && old != v {
+		mlx.Release(old)
+	}
+}
+
+func NewRecurrentCache(convTail, convDim, numVHeads, headVDim, headKDim int32) *RecurrentCache {
+	return &RecurrentCache{
+		convTail:  int(convTail),
+		convDim:   int(convDim),
+		numVHeads: int(numVHeads),
+		headVDim:  int(headVDim),
+		headKDim:  int(headKDim),
+	}
+}
+
+func (c *RecurrentCache) ensure(batch int, dtype mlx.DType) {
+	if batch <= 0 {
+		batch = 1
+	}
+
+	if c.convState == nil || c.convState.DType() != dtype ||
+		c.convState.Dim(0) != batch || c.convState.Dim(1) != c.convTail || c.convState.Dim(2) != c.convDim {
+		c.setStateRaw(&c.convState, mlx.Zeros(dtype, batch, c.convTail, c.convDim))
+	}
+
+	if c.deltaState == nil || c.deltaState.DType() != dtype ||
+		c.deltaState.Dim(0) != batch || c.deltaState.Dim(1) != c.numVHeads || c.deltaState.Dim(2) != c.headVDim || c.deltaState.Dim(3) != c.headKDim {
+		c.setStateRaw(&c.deltaState, mlx.Zeros(dtype, batch, c.numVHeads, c.headVDim, c.headKDim))
+	}
+}
+
+func (c *RecurrentCache) ConvState(batch int, dtype mlx.DType) *mlx.Array {
+	c.ensure(batch, dtype)
+	return c.convState
+}
+
+func (c *RecurrentCache) SetConvState(v *mlx.Array) {
+	c.setStateMaterialized(&c.convState, v)
+}
+
+func (c *RecurrentCache) DeltaState(batch int, dtype mlx.DType) *mlx.Array {
+	c.ensure(batch, dtype)
+	return c.deltaState
+}
+
+func (c *RecurrentCache) SetDeltaState(v *mlx.Array) {
+	c.setStateMaterialized(&c.deltaState, v)
+}
+
+func (c *RecurrentCache) Advance(n int) {
+	c.offset += n
+}
+
+func (c *RecurrentCache) Update(keys, values *mlx.Array) (*mlx.Array, *mlx.Array) {
+	return keys, values
+}
+
+func (c *RecurrentCache) State() (*mlx.Array, *mlx.Array) {
+	c.ensure(1, mlx.DTypeFloat32)
+	return c.convState, c.deltaState
+}
+
+func (c *RecurrentCache) Materialize() []*mlx.Array {
+	out := make([]*mlx.Array, 0, 2)
+	if c.convState != nil && c.convState.Valid() {
+		out = append(out, c.convState)
+	}
+	if c.deltaState != nil && c.deltaState.Valid() {
+		out = append(out, c.deltaState)
+	}
+	return out
+}
+
+func (c *RecurrentCache) Trim(n int) int {
+	n = min(c.offset, n)
+	c.offset -= n
+	// Recurrent state cannot be reversed cheaply; reset to a clean state when trimming.
+	if n > 0 {
+		if c.convState != nil {
+			c.setStateRaw(&c.convState, mlx.Zeros(c.convState.DType(), c.convState.Dim(0), c.convState.Dim(1), c.convState.Dim(2)))
+		}
+		if c.deltaState != nil {
+			c.setStateRaw(&c.deltaState, mlx.Zeros(c.deltaState.DType(), c.deltaState.Dim(0), c.deltaState.Dim(1), c.deltaState.Dim(2), c.deltaState.Dim(3)))
+		}
+	}
+	return n
+}
+
+func (c *RecurrentCache) Clone() Cache {
+	clone := &RecurrentCache{
+		offset:    c.offset,
+		convTail:  c.convTail,
+		convDim:   c.convDim,
+		numVHeads: c.numVHeads,
+		headVDim:  c.headVDim,
+		headKDim:  c.headKDim,
+	}
+	if c.convState != nil {
+		clone.convState = c.convState.Clone()
+	}
+	if c.deltaState != nil {
+		clone.deltaState = c.deltaState.Clone()
+	}
+	return clone
+}
+
+func (c *RecurrentCache) Offset() int { return c.offset }
+func (c *RecurrentCache) Len() int    { return c.offset }
--- a/x/mlxrunner/imports.go
+++ b/x/mlxrunner/imports.go
@@ -7,4 +7,6 @@ import (
 	_ "github.com/ollama/ollama/x/models/glm4_moe_lite"
 	_ "github.com/ollama/ollama/x/models/llama"
 	_ "github.com/ollama/ollama/x/models/qwen3"
+	_ "github.com/ollama/ollama/x/models/qwen3_5"
+	_ "github.com/ollama/ollama/x/models/qwen3_5_moe"
 )
--- a/x/mlxrunner/mlx/array.go
+++ b/x/mlxrunner/mlx/array.go
@@ -272,3 +272,39 @@ func Free(s ...*Array) (n int) {

 	return n
 }
+
+// Release forcibly frees arrays regardless of reference accounting.
+// Use only for arrays that are known to be unreachable by any live model state.
+func Release(s ...*Array) (n int) {
+	seen := make(map[*Array]bool, len(s))
+	for _, t := range s {
+		if t == nil || !t.Valid() || seen[t] {
+			continue
+		}
+		seen[t] = true
+		n += t.NumBytes()
+		C.mlx_array_free(t.ctx)
+		t.ctx.ctx = nil
+		t.desc.inputs = nil
+		t.desc.numRefs = 0
+	}
+	return n
+}
+
+const pinnedNumRefs = 1 << 30
+
+// Pin keeps arrays alive for the process lifetime by setting a very high
+// reference count floor. Use for model parameter tensors shared across many
+// decode steps, where recursive Free traversals must never reclaim them.
+func Pin(s ...*Array) {
+	seen := make(map[*Array]bool, len(s))
+	for _, t := range s {
+		if t == nil || !t.Valid() || seen[t] {
+			continue
+		}
+		seen[t] = true
+		if t.desc.numRefs < pinnedNumRefs {
+			t.desc.numRefs = pinnedNumRefs
+		}
+	}
+}
--- a/x/mlxrunner/mlx/dynamic.go
+++ b/x/mlxrunner/mlx/dynamic.go
@@ -55,6 +55,30 @@ func tryLoadFromDir(dir string) bool {
 	return false
 }

+// tryLoadByName attempts to load the library using just its name,
+// allowing the system to use rpath, LD_LIBRARY_PATH, or standard search paths.
+// Returns true if the library was successfully loaded.
+func tryLoadByName() bool {
+	libraryName := "libmlxc.dylib"
+	if runtime.GOOS == "linux" {
+		libraryName = "libmlxc.so"
+	}
+
+	cPath := C.CString(libraryName)
+	defer C.free(unsafe.Pointer(cPath))
+
+	var handle C.mlx_dynamic_handle
+	if C.mlx_dynamic_load(&handle, cPath) != 0 {
+		return false
+	}
+	if C.mlx_dynamic_load_symbols(handle) != 0 {
+		C.mlx_dynamic_unload(&handle)
+		return false
+	}
+
+	return true
+}
+
 func init() {
 	switch runtime.GOOS {
 	case "darwin":
@@ -73,6 +97,11 @@ func init() {
 		}
 	}

+	// Try loading via rpath/standard library search
+	if tryLoadByName() {
+		return
+	}
+
 	// Build search paths: executable directory, then build directories
 	var searchDirs []string
 	if exe, err := os.Executable(); err == nil {
--- a/x/mlxrunner/mlx/ops_extra.go
+++ b/x/mlxrunner/mlx/ops_extra.go
@@ -279,6 +279,24 @@ func Sigmoid(a *Array) *Array {
 	return a.Sigmoid()
 }

+func Exp(a *Array) *Array {
+	out := New("EXP", a)
+	C.mlx_exp(&out.ctx, a.ctx, DefaultStream().ctx)
+	return out
+}
+
+func Log(a *Array) *Array {
+	out := New("LOG", a)
+	C.mlx_log(&out.ctx, a.ctx, DefaultStream().ctx)
+	return out
+}
+
+func SoftmaxAxis(a *Array, axis int, precise bool) *Array {
+	out := New("SOFTMAX_AXIS", a)
+	C.mlx_softmax_axis(&out.ctx, a.ctx, C.int(axis), C.bool(precise), DefaultStream().ctx)
+	return out
+}
+
 func ScaledDotProductAttentionCausal(q, k, v *Array, scale float32, causalMask bool) *Array {
 	mask := New("")
 	sinks := New("")
@@ -386,6 +404,52 @@ func Collect(v any) []*Array {
 	return arrays
 }

+// Snapshot copies an array into a fresh leaf value with no Go-side graph inputs.
+func Snapshot(a *Array) *Array {
+	if a == nil || !a.Valid() {
+		return a
+	}
+	out := New("SNAPSHOT")
+	C.mlx_copy(&out.ctx, a.ctx, DefaultStream().ctx)
+	return out
+}
+
+// CollectReachable collects arrays from v and all transitive graph inputs.
+func CollectReachable(v any) []*Array {
+	roots := Collect(v)
+	if len(roots) == 0 {
+		return nil
+	}
+
+	seen := make(map[*Array]bool, len(roots))
+	out := make([]*Array, 0, len(roots))
+	stack := append([]*Array(nil), roots...)
+	for len(stack) > 0 {
+		a := stack[len(stack)-1]
+		stack = stack[:len(stack)-1]
+
+		if a == nil || !a.Valid() || seen[a] {
+			continue
+		}
+		seen[a] = true
+		out = append(out, a)
+		stack = append(stack, a.desc.inputs...)
+	}
+
+	return out
+}
+
+// Detach returns a new Array handle that shares the same MLX value but does
+// not retain Go-side graph input references.
+func Detach(a *Array) *Array {
+	if a == nil || !a.Valid() {
+		return a
+	}
+	out := New("DETACH")
+	C.mlx_array_set(&out.ctx, a.ctx)
+	return out
+}
+
 func collect(v reflect.Value, arrays *[]*Array, seen map[uintptr]bool) {
 	if !v.IsValid() {
 		return
--- a/x/mlxrunner/model/base/base.go
+++ b/x/mlxrunner/model/base/base.go
@@ -8,10 +8,10 @@ import (
 	"log/slog"
 	"sync"

-	"github.com/ollama/ollama/x/imagegen/tokenizer"
 	"github.com/ollama/ollama/x/mlxrunner/cache"
 	"github.com/ollama/ollama/x/mlxrunner/mlx"
 	"github.com/ollama/ollama/x/mlxrunner/model"
+	"github.com/ollama/ollama/x/tokenizer"
 )

 // Model is the interface that model implementations must satisfy.
--- a/x/mlxrunner/pipeline.go
+++ b/x/mlxrunner/pipeline.go
@@ -6,13 +6,43 @@ import (
 	"bytes"
 	"errors"
 	"log/slog"
+	"os"
+	"strconv"
 	"time"
-	"unicode/utf8"

 	"github.com/ollama/ollama/x/mlxrunner/cache"
 	"github.com/ollama/ollama/x/mlxrunner/mlx"
 )

+func prefillChunkSize(lowMemoryDecode bool) int {
+	if v := os.Getenv("OLLAMA_MLX_PREFILL_CHUNK"); v != "" {
+		if n, err := strconv.Atoi(v); err == nil && n > 0 {
+			return n
+		}
+	}
+
+	if lowMemoryDecode {
+		// Recurrent/no-prompt-cache path favors lower peak memory over prefill throughput.
+		// Keep this conservative to avoid transient prefill spikes and allocator thrash.
+		return 32
+	}
+	return 2 << 10
+}
+
+func mlxDebugMemoryEnabled() bool {
+	return os.Getenv("OLLAMA_MLX_DEBUG_MEMORY") != ""
+}
+
+func finalizeRequestCaches(usePromptCache bool, insertCache func(), freeCaches func(), logMemory func(string, int)) {
+	if usePromptCache {
+		insertCache()
+		logMemory("request_done_cached", -1)
+		return
+	}
+	freeCaches()
+	logMemory("request_done_freed", -1)
+}
+
 func (r *Runner) TextGenerationPipeline(request Request) error {
 	if r.Model == nil {
 		return errors.New("model not loaded")
@@ -30,7 +60,21 @@ func (r *Runner) TextGenerationPipeline(request Request) error {

 	inputs := r.Tokenizer.Encode(request.Prompt, true)

-	caches, tokens := r.FindNearestCache(inputs)
+	usePromptCache := true
+	if m, ok := r.Model.(interface{ DisablePromptCache() bool }); ok && m.DisablePromptCache() {
+		usePromptCache = false
+	}
+	lowMemoryDecode := !usePromptCache
+	prefillChunk := prefillChunkSize(lowMemoryDecode)
+
+	var caches []cache.Cache
+	var tokens []int32
+	if usePromptCache {
+		caches, tokens = r.FindNearestCache(inputs)
+	} else {
+		tokens = inputs
+	}
+
 	if len(caches) == 0 {
 		if cacheFactory, ok := r.Model.(interface{ NewCaches() []cache.Cache }); ok {
 			caches = cacheFactory.NewCaches()
@@ -42,23 +86,54 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
 		}
 	}

+	materializeCaches := func() {
+		state := make([]*mlx.Array, 0, 2*len(caches))
+		for _, c := range caches {
+			state = append(state, c.Materialize()...)
+		}
+		if len(state) == 0 {
+			return
+		}
+		mlx.Eval(state...)
+	}
+	freeCaches := func() {
+		state := make([]*mlx.Array, 0, 2*len(caches))
+		for _, c := range caches {
+			state = append(state, c.Materialize()...)
+		}
+		if len(state) == 0 {
+			return
+		}
+		// Non-prompt-cache requests allocate fresh caches every generation.
+		// Explicitly free cache roots so graph chains are reclaimed promptly.
+		mlx.Free(state...)
+		mlx.ClearCache()
+	}
+	debugMemory := mlxDebugMemoryEnabled()
+	logMemory := func(phase string, token int) {
+		if !debugMemory {
+			return
+		}
+		if token >= 0 {
+			slog.Info("MLX memory", "phase", phase, "token", token, "memory", mlx.Memory{})
+			return
+		}
+		slog.Info("MLX memory", "phase", phase, "memory", mlx.Memory{})
+	}
+	logMemory("prefill_start", -1)
+
 	total, processed := len(tokens), 0
 	slog.Info("Prompt processing progress", "processed", processed, "total", total)
 	for total-processed > 1 {
-		n := min(2<<10, total-processed-1)
+		n := min(prefillChunk, total-processed-1)
 		temp := r.Model.Forward(mlx.FromValues(tokens[processed:processed+n], n).ExpandDims(0), caches)
-		defer mlx.Free(temp)
-		mlx.Eval(func() []*mlx.Array {
-			s := make([]*mlx.Array, 2*len(caches))
-			for i, c := range caches {
-				s[2*i], s[2*i+1] = c.State()
-			}
-			return s
-		}()...)
+		materializeCaches()
+		mlx.Free(temp)
 		processed += n
 		slog.Info("Prompt processing progress", "processed", processed, "total", total)
 		mlx.ClearCache()
 	}
+	logMemory("prefill_done", -1)

 	step := func(token *mlx.Array) (*mlx.Array, *mlx.Array) {
 		fwd := r.Model.Forward(token.ExpandDims(0), caches)
@@ -70,7 +145,13 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
 	}

 	sample, logprobs := step(mlx.FromValues(tokens[processed:], total-processed))
-	mlx.AsyncEval(sample, logprobs)
+	if !lowMemoryDecode {
+		mlx.AsyncEval(sample, logprobs)
+	} else {
+		// Materialize cache updates to prevent transform graph growth.
+		materializeCaches()
+	}
+	logMemory("decode_init", -1)

 	var b bytes.Buffer

@@ -78,12 +159,10 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
 	final := Response{Done: true, PromptTokens: total, CompletionTokens: request.Options.MaxTokens, DoneReason: 1}
 	outputs := make([]int32, 0, request.Options.MaxTokens)
 	for i := range request.Options.MaxTokens {
-		nextSample, nextLogprobs := step(sample)
-		mlx.AsyncEval(nextSample, nextLogprobs)
-
 		if i == 0 {
 			slog.Info("Prompt processing progress", "processed", total, "total", total)
 			mlx.Eval(sample)
+			logMemory("decode_first_eval", i)
 			final.PromptTokensDuration = time.Since(now)
 			now = time.Now()
 		}
@@ -95,6 +174,7 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
 			final.Token = int(output)
 			final.DoneReason = 0
 			final.CompletionTokens = i
+			mlx.Free(sample, logprobs)
 			break
 		}

@@ -103,18 +183,43 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
 			Token: int(output),
 		}

+		// For recurrent linear-attention models, avoid async prefetch to reduce
+		// peak memory and clear allocator cache every token.
+		if lowMemoryDecode {
+			mlx.Free(sample, logprobs)
+			if i+1 >= request.Options.MaxTokens {
+				break
+			}
+			mlx.ClearCache()
+			sample, logprobs = step(mlx.FromValues([]int32{output}, 1))
+			// Materialize cache updates to avoid unbounded transform chains.
+			materializeCaches()
+			if i%32 == 0 {
+				logMemory("decode_lowmem_step", i)
+			}
+			continue
+		}
+
+		nextSample, nextLogprobs := step(sample)
+		mlx.AsyncEval(nextSample, nextLogprobs)
+
 		mlx.Free(sample, logprobs)
 		if i%256 == 0 {
 			mlx.ClearCache()
 		}
+		if i%64 == 0 {
+			logMemory("decode_async_step", i)
+		}

 		sample, logprobs = nextSample, nextLogprobs
 	}
-
-	mlx.Free(sample, logprobs)
 	final.CompletionTokensDuration = time.Since(now)
 	request.Responses <- final
-	r.InsertCache(append(inputs, outputs...), caches)
+	finalizeRequestCaches(usePromptCache,
+		func() { r.InsertCache(append(inputs, outputs...), caches) },
+		freeCaches,
+		logMemory,
+	)
 	return nil
 }

@@ -126,13 +231,5 @@ func (r Runner) Decode(sample int32, b *bytes.Buffer) string {
 		return ""
 	}

-	if text := b.String(); utf8.ValidString(text) {
-		b.Reset()
-		return text
-	} else if b.Len() >= utf8.UTFMax {
-		b.Reset()
-		return text
-	}
-
-	return ""
+	return flushValidUTF8Prefix(b)
 }
--- a/x/mlxrunner/pipeline_helpers_test.go
+++ b/x/mlxrunner/pipeline_helpers_test.go
@@ -0,0 +1,83 @@
+//go:build mlx
+
+package mlxrunner
+
+import "testing"
+
+func TestPrefillChunkSize(t *testing.T) {
+	t.Setenv("OLLAMA_MLX_PREFILL_CHUNK", "")
+	if got := prefillChunkSize(false); got != 2<<10 {
+		t.Fatalf("prefillChunkSize(false) = %d, want %d", got, 2<<10)
+	}
+	if got := prefillChunkSize(true); got != 32 {
+		t.Fatalf("prefillChunkSize(true) = %d, want %d", got, 32)
+	}
+}
+
+func TestPrefillChunkSizeEnvOverride(t *testing.T) {
+	t.Setenv("OLLAMA_MLX_PREFILL_CHUNK", "96")
+	if got := prefillChunkSize(false); got != 96 {
+		t.Fatalf("prefillChunkSize(false) with env = %d, want %d", got, 96)
+	}
+	if got := prefillChunkSize(true); got != 96 {
+		t.Fatalf("prefillChunkSize(true) with env = %d, want %d", got, 96)
+	}
+}
+
+func TestMLXDebugMemoryEnabled(t *testing.T) {
+	t.Setenv("OLLAMA_MLX_DEBUG_MEMORY", "")
+	if mlxDebugMemoryEnabled() {
+		t.Fatal("mlxDebugMemoryEnabled() = true, want false")
+	}
+
+	t.Setenv("OLLAMA_MLX_DEBUG_MEMORY", "1")
+	if !mlxDebugMemoryEnabled() {
+		t.Fatal("mlxDebugMemoryEnabled() = false, want true")
+	}
+}
+
+func TestFinalizeRequestCachesUsesPromptCachePath(t *testing.T) {
+	insertCalls := 0
+	freeCalls := 0
+	logPhase := ""
+
+	finalizeRequestCaches(
+		true,
+		func() { insertCalls++ },
+		func() { freeCalls++ },
+		func(phase string, _ int) { logPhase = phase },
+	)
+
+	if insertCalls != 1 {
+		t.Fatalf("insert calls = %d, want 1", insertCalls)
+	}
+	if freeCalls != 0 {
+		t.Fatalf("free calls = %d, want 0", freeCalls)
+	}
+	if logPhase != "request_done_cached" {
+		t.Fatalf("log phase = %q, want %q", logPhase, "request_done_cached")
+	}
+}
+
+func TestFinalizeRequestCachesUsesFreePath(t *testing.T) {
+	insertCalls := 0
+	freeCalls := 0
+	logPhase := ""
+
+	finalizeRequestCaches(
+		false,
+		func() { insertCalls++ },
+		func() { freeCalls++ },
+		func(phase string, _ int) { logPhase = phase },
+	)
+
+	if insertCalls != 0 {
+		t.Fatalf("insert calls = %d, want 0", insertCalls)
+	}
+	if freeCalls != 1 {
+		t.Fatalf("free calls = %d, want 1", freeCalls)
+	}
+	if logPhase != "request_done_freed" {
+		t.Fatalf("log phase = %q, want %q", logPhase, "request_done_freed")
+	}
+}
--- a/x/mlxrunner/runner.go
+++ b/x/mlxrunner/runner.go
@@ -12,12 +12,12 @@ import (

 	"golang.org/x/sync/errgroup"

-	"github.com/ollama/ollama/x/imagegen/tokenizer"
 	"github.com/ollama/ollama/x/mlxrunner/cache"
 	"github.com/ollama/ollama/x/mlxrunner/mlx"
 	"github.com/ollama/ollama/x/mlxrunner/model"
 	"github.com/ollama/ollama/x/mlxrunner/model/base"
 	"github.com/ollama/ollama/x/mlxrunner/sample"
+	"github.com/ollama/ollama/x/tokenizer"
 )

 type Request struct {
@@ -64,6 +64,38 @@ type Runner struct {
 	CacheEntries map[int32]*CacheEntry
 }

+func releaseTensorMap(tensors map[string]*mlx.Array, keep map[*mlx.Array]struct{}) (count int, bytes int) {
+	if len(tensors) == 0 {
+		return 0, 0
+	}
+
+	seen := make(map[*mlx.Array]bool, len(tensors))
+	toRelease := make([]*mlx.Array, 0, len(tensors))
+	for name, arr := range tensors {
+		if arr == nil || !arr.Valid() {
+			delete(tensors, name)
+			continue
+		}
+		if keep != nil {
+			if _, ok := keep[arr]; ok {
+				continue
+			}
+		}
+		delete(tensors, name)
+		if seen[arr] {
+			continue
+		}
+		seen[arr] = true
+		toRelease = append(toRelease, arr)
+	}
+
+	if len(toRelease) == 0 {
+		return 0, 0
+	}
+
+	return len(toRelease), mlx.Release(toRelease...)
+}
+
 func (r *Runner) Load(modelName string) error {
 	root, err := model.Open(modelName)
 	if err != nil {
@@ -85,9 +117,29 @@ func (r *Runner) Load(modelName string) error {
 	// Assign weights to model (model-specific logic)
 	loadWeights := base.Weights(m)
 	if err := loadWeights(tensors); err != nil {
+		if count, bytes := releaseTensorMap(tensors, nil); count > 0 {
+			slog.Info("Released tensors after load failure", "count", count, "bytes", mlx.PrettyBytes(bytes))
+			mlx.ClearCache()
+		}
 		return err
 	}

+	// Pin only model-owned tensor roots. Pinning the full transitive graph can
+	// retain large load-time intermediates and inflate steady-state memory.
+	roots := mlx.Collect(m)
+	mlx.Pin(roots...)
+
+	keep := make(map[*mlx.Array]struct{})
+	for _, arr := range roots {
+		if arr != nil && arr.Valid() {
+			keep[arr] = struct{}{}
+		}
+	}
+	if count, bytes := releaseTensorMap(tensors, keep); count > 0 {
+		slog.Info("Released unused model tensors", "count", count, "bytes", mlx.PrettyBytes(bytes))
+		mlx.ClearCache()
+	}
+
 	r.Model = m
 	r.Tokenizer = m.Tokenizer()
 	return nil
--- a/x/mlxrunner/utf8_buffer.go
+++ b/x/mlxrunner/utf8_buffer.go
@@ -0,0 +1,47 @@
+package mlxrunner
+
+import (
+	"bytes"
+	"unicode/utf8"
+)
+
+// flushValidUTF8Prefix returns and consumes the longest valid UTF-8 prefix
+// currently buffered, leaving any incomplete trailing bytes in place.
+func flushValidUTF8Prefix(b *bytes.Buffer) string {
+	data := b.Bytes()
+	if len(data) == 0 {
+		return ""
+	}
+
+	prefix := validUTF8PrefixLen(data)
+	if prefix == 0 {
+		return ""
+	}
+
+	text := string(data[:prefix])
+	b.Next(prefix)
+	return text
+}
+
+func validUTF8PrefixLen(data []byte) int {
+	i := 0
+	prefix := 0
+	for i < len(data) {
+		r, size := utf8.DecodeRune(data[i:])
+		if r == utf8.RuneError && size == 1 {
+			if !utf8.FullRune(data[i:]) {
+				break
+			}
+
+			// Invalid UTF-8 byte; consume one byte to guarantee forward progress.
+			i++
+			prefix = i
+			continue
+		}
+
+		i += size
+		prefix = i
+	}
+
+	return prefix
+}
--- a/x/mlxrunner/utf8_buffer_test.go
+++ b/x/mlxrunner/utf8_buffer_test.go
@@ -0,0 +1,46 @@
+package mlxrunner
+
+import (
+	"bytes"
+	"testing"
+)
+
+func TestFlushValidUTF8Prefix_PreservesIncompleteRune(t *testing.T) {
+	var b bytes.Buffer
+
+	b.Write([]byte{0xE3, 0x81})
+	if got := flushValidUTF8Prefix(&b); got != "" {
+		t.Fatalf("first flush = %q, want empty", got)
+	}
+
+	b.Write([]byte{0x93, 0xE3})
+	if got := flushValidUTF8Prefix(&b); got != "こ" {
+		t.Fatalf("second flush = %q, want %q", got, "こ")
+	}
+
+	if got := b.Bytes(); !bytes.Equal(got, []byte{0xE3}) {
+		t.Fatalf("buffer after second flush = %v, want %v", got, []byte{0xE3})
+	}
+
+	b.Write([]byte{0x82, 0x93})
+	if got := flushValidUTF8Prefix(&b); got != "ん" {
+		t.Fatalf("third flush = %q, want %q", got, "ん")
+	}
+
+	if b.Len() != 0 {
+		t.Fatalf("buffer not empty after third flush: %d", b.Len())
+	}
+}
+
+func TestFlushValidUTF8Prefix_ValidText(t *testing.T) {
+	var b bytes.Buffer
+	b.WriteString("hello 世界")
+
+	if got := flushValidUTF8Prefix(&b); got != "hello 世界" {
+		t.Fatalf("flush = %q, want %q", got, "hello 世界")
+	}
+
+	if b.Len() != 0 {
+		t.Fatalf("buffer not empty after flush: %d", b.Len())
+	}
+}
--- a/x/models/gemma3/gemma3.go
+++ b/x/models/gemma3/gemma3.go
@@ -8,12 +8,12 @@ import (
 	"fmt"
 	"math"

-	"github.com/ollama/ollama/x/imagegen/tokenizer"
 	"github.com/ollama/ollama/x/mlxrunner/cache"
 	"github.com/ollama/ollama/x/mlxrunner/mlx"
 	"github.com/ollama/ollama/x/mlxrunner/model"
 	"github.com/ollama/ollama/x/mlxrunner/model/base"
 	"github.com/ollama/ollama/x/models/nn"
+	"github.com/ollama/ollama/x/tokenizer"
 )

 func init() {
--- a/x/models/glm4_moe_lite/glm4_moe_lite.go
+++ b/x/models/glm4_moe_lite/glm4_moe_lite.go
@@ -9,12 +9,12 @@ import (
 	"fmt"
 	"math"

-	"github.com/ollama/ollama/x/imagegen/tokenizer"
 	"github.com/ollama/ollama/x/mlxrunner/cache"
 	"github.com/ollama/ollama/x/mlxrunner/mlx"
 	"github.com/ollama/ollama/x/mlxrunner/model"
 	"github.com/ollama/ollama/x/mlxrunner/model/base"
 	"github.com/ollama/ollama/x/models/nn"
+	"github.com/ollama/ollama/x/tokenizer"
 )

 func init() {
--- a/x/models/llama/llama.go
+++ b/x/models/llama/llama.go
@@ -8,12 +8,12 @@ import (
 	"fmt"
 	"math"

-	"github.com/ollama/ollama/x/imagegen/tokenizer"
 	"github.com/ollama/ollama/x/mlxrunner/cache"
 	"github.com/ollama/ollama/x/mlxrunner/mlx"
 	"github.com/ollama/ollama/x/mlxrunner/model"
 	"github.com/ollama/ollama/x/mlxrunner/model/base"
 	"github.com/ollama/ollama/x/models/nn"
+	"github.com/ollama/ollama/x/tokenizer"
 )

 func init() {
--- a/x/models/qwen3/qwen3.go
+++ b/x/models/qwen3/qwen3.go
@@ -8,12 +8,12 @@ import (
 	"fmt"
 	"math"

-	"github.com/ollama/ollama/x/imagegen/tokenizer"
 	"github.com/ollama/ollama/x/mlxrunner/cache"
 	"github.com/ollama/ollama/x/mlxrunner/mlx"
 	"github.com/ollama/ollama/x/mlxrunner/model"
 	"github.com/ollama/ollama/x/mlxrunner/model/base"
 	"github.com/ollama/ollama/x/models/nn"
+	"github.com/ollama/ollama/x/tokenizer"
 )

 func init() {
--- a/x/models/qwen3_5/qwen3_5.go
+++ b/x/models/qwen3_5/qwen3_5.go
--- a/x/models/qwen3_5/qwen3_5_test.go
+++ b/x/models/qwen3_5/qwen3_5_test.go
@@ -0,0 +1,120 @@
+//go:build mlx
+
+package qwen3_5
+
+import (
+	"testing"
+
+	"github.com/ollama/ollama/x/mlxrunner/cache"
+)
+
+func TestParseConfigNestedDefaults(t *testing.T) {
+	data := []byte(`{
+		"model_type": "Qwen3_5MoeForConditionalGeneration",
+		"text_config": {
+			"hidden_size": 4096,
+			"intermediate_size": 14336,
+			"num_hidden_layers": 8,
+			"num_attention_heads": 32,
+			"num_key_value_heads": 8,
+			"head_dim": 128,
+			"linear_num_value_heads": 64,
+			"linear_num_key_heads": 16,
+			"linear_key_head_dim": 128,
+			"linear_value_head_dim": 128,
+			"linear_conv_kernel_dim": 4,
+			"num_experts": 16,
+			"num_experts_per_tok": 4,
+			"moe_intermediate_size": 2048,
+			"shared_expert_intermediate_size": 4096,
+			"rope_parameters": {
+				"rope_theta": 500000,
+				"partial_rotary_factor": 0.5
+			}
+		}
+	}`)
+
+	cfg, err := parseConfig(data)
+	if err != nil {
+		t.Fatalf("parseConfig failed: %v", err)
+	}
+
+	if cfg.RopeTheta != 500000 {
+		t.Fatalf("rope theta mismatch: got %v", cfg.RopeTheta)
+	}
+	if cfg.RopeDim != 64 {
+		t.Fatalf("rope dim mismatch: got %d want 64", cfg.RopeDim)
+	}
+	if cfg.FullAttentionInterval != 4 {
+		t.Fatalf("full_attention_interval default mismatch: got %d want 4", cfg.FullAttentionInterval)
+	}
+	if !cfg.NormTopKProb {
+		t.Fatalf("norm_topk_prob should default to true for MoE")
+	}
+}
+
+func TestLayerSelectionHelpers(t *testing.T) {
+	cfg := &Config{
+		NumHiddenLayers:       6,
+		FullAttentionInterval: 3,
+		NumExperts:            8,
+		DecoderSparseStep:     2,
+		MLPOnlyLayers:         []int32{1},
+	}
+
+	if !layerIsLinear(cfg, 0) {
+		t.Fatalf("layer 0 should be linear")
+	}
+	if layerIsLinear(cfg, 2) {
+		t.Fatalf("layer 2 should be full attention")
+	}
+
+	if layerUsesMoE(cfg, 1) {
+		t.Fatalf("layer 1 should be forced dense by mlp_only_layers")
+	}
+	if !layerUsesMoE(cfg, 3) {
+		t.Fatalf("layer 3 should use moe with decoder_sparse_step=2")
+	}
+}
+
+func TestModelRuntimeToggles(t *testing.T) {
+	m := &Model{}
+	if !m.DisablePromptCache() {
+		t.Fatal("DisablePromptCache() = false, want true")
+	}
+	if m.EnableCompile() {
+		t.Fatal("EnableCompile() = true, want false")
+	}
+}
+
+func TestNewCachesLayout(t *testing.T) {
+	m := &Model{
+		Config: &Config{
+			LinearConvKernelDim: 4,
+			LinearNumKeyHeads:   2,
+			LinearKeyHeadDim:    8,
+			LinearNumValueHeads: 4,
+			LinearValueHeadDim:  16,
+		},
+		Layers: []*Layer{
+			{IsLinear: true},
+			{IsLinear: false},
+			{IsLinear: true},
+		},
+	}
+
+	caches := m.NewCaches()
+	if len(caches) != len(m.Layers) {
+		t.Fatalf("len(caches) = %d, want %d", len(caches), len(m.Layers))
+	}
+
+	if _, ok := caches[0].(*cache.RecurrentCache); !ok {
+		t.Fatalf("cache[0] = %T, want *cache.RecurrentCache", caches[0])
+	}
+	if _, ok := caches[1].(*cache.KVCache); !ok {
+		t.Fatalf("cache[1] = %T, want *cache.KVCache", caches[1])
+	}
+	if _, ok := caches[2].(*cache.RecurrentCache); !ok {
+		t.Fatalf("cache[2] = %T, want *cache.RecurrentCache", caches[2])
+	}
+}
--- a/x/models/qwen3_5_moe/qwen3_5_moe.go
+++ b/x/models/qwen3_5_moe/qwen3_5_moe.go
@@ -0,0 +1,16 @@
+//go:build mlx
+
+// Package qwen3_5_moe registers Qwen 3.5 MoE architecture aliases.
+package qwen3_5_moe
+
+import (
+	"github.com/ollama/ollama/x/mlxrunner/model/base"
+	"github.com/ollama/ollama/x/models/qwen3_5"
+)
+
+func init() {
+	base.Register("Qwen3_5MoeForConditionalGeneration", qwen3_5.NewModel)
+	base.Register("Qwen3_5MoeForCausalLM", qwen3_5.NewModel)
+	base.Register("Qwen3NextMoeForConditionalGeneration", qwen3_5.NewModel)
+	base.Register("Qwen3NextMoeForCausalLM", qwen3_5.NewModel)
+}
--- a/x/tokenizer/tokenizer.go
+++ b/x/tokenizer/tokenizer.go
@@ -0,0 +1,108 @@
+//go:build mlx
+
+// tokenizer.go - BPE and SentencePiece tokenizer for HuggingFace models
+//
+// Based on standard BPE algorithm (Sennrich et al. 2015) with:
+// - GPT-2 byte-level encoding (OpenAI tiktoken)
+// - HuggingFace tokenizer.json pretokenizer patterns
+// - SentencePiece ▁-style space handling
+
+package tokenizer
+
+import "regexp"
+
+// TokenizerType identifies the tokenization algorithm
+type TokenizerType int
+
+const (
+	TokenizerBPE           TokenizerType = iota // GPT-2 style byte-level BPE
+	TokenizerSentencePiece                      // SentencePiece with ▁ for spaces
+)
+
+// Vocabulary holds the tokenizer vocabulary and merges
+type Vocabulary struct {
+	Values  []string
+	Reverse map[string]int32
+	Merges  map[string]int
+
+	BOS    int32
+	EOS    []int32 // Multiple EOS tokens supported (e.g., Gemma has <eos> and <end_of_turn>)
+	PAD    int32   // Padding token (often <|endoftext|> or <pad>)
+	AddBOS bool
+	AddEOS bool
+
+	// Precomputed byte token IDs for <0xNN> fallback (256 entries, -1 if not found)
+	byteTokens [256]int32
+}
+
+// Tokenizer handles BPE and SentencePiece tokenization
+type Tokenizer struct {
+	vocab               *Vocabulary
+	pretokenizer        *regexp.Regexp
+	specialTokens       map[string]int32 // Special tokens for direct lookup
+	sortedSpecialTokens []string         // Special tokens sorted by length, longest first
+	typ                 TokenizerType    // Algorithm type
+}
+
+// Precomputed GPT-2 byte-level encoding table
+// Maps byte values to their encoded rune equivalents
+var byteToRune [256]rune
+
+func init() {
+	for b := 0; b < 256; b++ {
+		r := rune(b)
+		switch {
+		case r == 0x00ad:
+			r = 0x0143
+		case r <= 0x0020:
+			r = r + 0x0100
+		case r >= 0x007f && r <= 0x00a0:
+			r = r + 0x00a2
+		}
+		byteToRune[b] = r
+	}
+}
+
+// VocabSize returns the vocabulary size
+func (t *Tokenizer) VocabSize() int {
+	return len(t.vocab.Values)
+}
+
+// BOS returns the beginning of sequence token ID
+func (t *Tokenizer) BOS() int32 {
+	return t.vocab.BOS
+}
+
+// EOS returns the first end of sequence token ID (for backwards compatibility)
+func (t *Tokenizer) EOS() int32 {
+	if len(t.vocab.EOS) > 0 {
+		return t.vocab.EOS[0]
+	}
+	return -1
+}
+
+// EOSTokens returns all end of sequence token IDs
+func (t *Tokenizer) EOSTokens() []int32 {
+	return t.vocab.EOS
+}
+
+// PAD returns the padding token ID, or -1 if not set
+func (t *Tokenizer) PAD() int32 {
+	return t.vocab.PAD
+}
+
+// IsEOS returns true if the token ID is an end of sequence token
+func (t *Tokenizer) IsEOS(id int32) bool {
+	for _, eos := range t.vocab.EOS {
+		if id == eos {
+			return true
+		}
+	}
+	return false
+}
+
+// GetSpecialToken returns the token ID for a special token string
+func (t *Tokenizer) GetSpecialToken(name string) (int32, bool) {
+	id, ok := t.specialTokens[name]
+	return id, ok
+}
--- a/x/tokenizer/tokenizer_benchmark_test.go
+++ b/x/tokenizer/tokenizer_benchmark_test.go
@@ -0,0 +1,251 @@
+//go:build mlx
+
+package tokenizer
+
+import (
+	"os"
+	"path/filepath"
+	"runtime"
+	"strings"
+	"testing"
+)
+
+var (
+	benchmarkSinkIDs []int32
+	benchmarkSinkStr string
+	benchmarkSinkTok *Tokenizer
+)
+
+const benchmarkWordPieceJSON = `{
+  "model": {
+    "type": "WordPiece",
+    "vocab": {
+      "[UNK]": 0,
+      "hello": 1,
+      "##world": 2,
+      "##ly": 3,
+      "##hello": 4
+    }
+  },
+  "added_tokens": []
+}`
+
+const benchmarkSentencePieceJSON = `{
+  "model": {
+    "type": "BPE",
+    "vocab": {
+      "\u2581": 0,
+      "h": 1,
+      "e": 2,
+      "l": 3,
+      "o": 4,
+      "w": 5,
+      "r": 6,
+      "d": 7,
+      "<0x0A>": 8
+    },
+    "merges": []
+  },
+  "decoder": {
+    "type": "Sequence",
+    "decoders": [
+      {
+        "type": "Replace",
+        "pattern": {
+          "String": "\u2581"
+        }
+      }
+    ]
+  },
+  "added_tokens": []
+}`
+
+func benchmarkMiniLlamaPath(tb testing.TB) string {
+	tb.Helper()
+
+	_, filename, _, ok := runtime.Caller(0)
+	if !ok {
+		tb.Fatal("failed to resolve benchmark file path")
+	}
+
+	return filepath.Join(filepath.Dir(filename), "..", "imagegen", "tokenizer", "testdata", "mini_llama.json")
+}
+
+func benchmarkLoadMiniLlama(tb testing.TB) *Tokenizer {
+	tb.Helper()
+
+	data := benchmarkLoadMiniLlamaBytes(tb)
+	tok, err := LoadFromBytes(data)
+	if err != nil {
+		tb.Fatalf("failed to load mini llama tokenizer: %v", err)
+	}
+	return tok
+}
+
+func benchmarkLoadMiniLlamaBytes(tb testing.TB) []byte {
+	tb.Helper()
+
+	data, err := os.ReadFile(benchmarkMiniLlamaPath(tb))
+	if err != nil {
+		tb.Fatalf("failed to read mini llama tokenizer: %v", err)
+	}
+	return data
+}
+
+func benchmarkLoadFromBytes(tb testing.TB, data []byte) *Tokenizer {
+	tb.Helper()
+
+	tok, err := LoadFromBytes(data)
+	if err != nil {
+		tb.Fatalf("failed to load tokenizer from bytes: %v", err)
+	}
+	return tok
+}
+
+func BenchmarkTokenizerEncodeBPE(b *testing.B) {
+	tok := benchmarkLoadMiniLlama(b)
+
+	inputs := []struct {
+		name string
+		text string
+	}{
+		{name: "short", text: "Hello, world!"},
+		{name: "medium", text: strings.Repeat("The quick brown fox jumps over the lazy dog. ", 16)},
+		{name: "long_sequential", text: strings.Repeat("The quick brown fox jumps over the lazy dog. ", 80)},
+		{name: "long_parallel", text: strings.Repeat("The quick brown fox jumps over the lazy dog. ", 160)},
+		{name: "huge_parallel", text: strings.Repeat("The quick brown fox jumps over the lazy dog. ", 640)},
+		{name: "special_tokens", text: "<|begin_of_text|>system\nYou are concise.<|end_of_text|>"},
+	}
+
+	for _, input := range inputs {
+		b.Run(input.name, func(b *testing.B) {
+			b.ReportAllocs()
+			b.SetBytes(int64(len(input.text)))
+			b.ResetTimer()
+
+			for i := 0; i < b.N; i++ {
+				benchmarkSinkIDs = tok.Encode(input.text, false)
+			}
+		})
+	}
+}
+
+func BenchmarkTokenizerDecodeBPE(b *testing.B) {
+	tok := benchmarkLoadMiniLlama(b)
+
+	inputs := []struct {
+		name string
+		text string
+	}{
+		{name: "medium", text: strings.Repeat("The quick brown fox jumps over the lazy dog. ", 16)},
+		{name: "long", text: strings.Repeat("The quick brown fox jumps over the lazy dog. ", 160)},
+	}
+
+	for _, input := range inputs {
+		ids := tok.Encode(input.text, false)
+		b.Run(input.name, func(b *testing.B) {
+			b.ReportAllocs()
+			b.SetBytes(int64(len(input.text)))
+			b.ResetTimer()
+
+			for i := 0; i < b.N; i++ {
+				benchmarkSinkStr = tok.Decode(ids)
+			}
+		})
+	}
+}
+
+func BenchmarkTokenizerLoadFromBytes(b *testing.B) {
+	data := benchmarkLoadMiniLlamaBytes(b)
+
+	config := &TokenizerConfig{
+		TokenizerConfigJSON: []byte(`{
+			"bos_token": {"content": "<|begin_of_text|>"},
+			"eos_token": {"content": "<|end_of_text|>"},
+			"add_bos_token": true
+		}`),
+		GenerationConfigJSON: []byte(`{"bos_token_id": 128000, "eos_token_id": 128001}`),
+	}
+
+	b.Run("without_config", func(b *testing.B) {
+		b.ReportAllocs()
+		b.SetBytes(int64(len(data)))
+		b.ResetTimer()
+
+		for i := 0; i < b.N; i++ {
+			tok, err := LoadFromBytes(data)
+			if err != nil {
+				b.Fatalf("LoadFromBytes failed: %v", err)
+			}
+			benchmarkSinkTok = tok
+		}
+	})
+
+	b.Run("with_config", func(b *testing.B) {
+		b.ReportAllocs()
+		b.SetBytes(int64(len(data)))
+		b.ResetTimer()
+
+		for i := 0; i < b.N; i++ {
+			tok, err := LoadFromBytesWithConfig(data, config)
+			if err != nil {
+				b.Fatalf("LoadFromBytesWithConfig failed: %v", err)
+			}
+			benchmarkSinkTok = tok
+		}
+	})
+}
+
+func BenchmarkTokenizerEncodeWordPiece(b *testing.B) {
+	tok := benchmarkLoadFromBytes(b, []byte(benchmarkWordPieceJSON))
+	text := strings.Repeat("helloworldly", 16)
+
+	b.ReportAllocs()
+	b.SetBytes(int64(len(text)))
+	b.ResetTimer()
+
+	for i := 0; i < b.N; i++ {
+		benchmarkSinkIDs = tok.Encode(text, false)
+	}
+}
+
+func BenchmarkTokenizerDecodeWordPiece(b *testing.B) {
+	tok := benchmarkLoadFromBytes(b, []byte(benchmarkWordPieceJSON))
+	text := strings.Repeat("helloworldly", 16)
+	ids := tok.Encode(text, false)
+
+	b.ReportAllocs()
+	b.SetBytes(int64(len(text)))
+	b.ResetTimer()
+
+	for i := 0; i < b.N; i++ {
+		benchmarkSinkStr = tok.Decode(ids)
+	}
+}
+
+func BenchmarkTokenizerEncodeSentencePiece(b *testing.B) {
+	tok := benchmarkLoadFromBytes(b, []byte(benchmarkSentencePieceJSON))
+	text := strings.Repeat("hello world\n", 64)
+
+	b.ReportAllocs()
+	b.SetBytes(int64(len(text)))
+	b.ResetTimer()
+
+	for i := 0; i < b.N; i++ {
+		benchmarkSinkIDs = tok.Encode(text, false)
+	}
+}
+
+func BenchmarkTokenizerDecodeSentencePiece(b *testing.B) {
+	tok := benchmarkLoadFromBytes(b, []byte(benchmarkSentencePieceJSON))
+	text := strings.Repeat("hello world\n", 64)
+	ids := tok.Encode(text, false)
+
+	b.ReportAllocs()
+	b.SetBytes(int64(len(text)))
+	b.ResetTimer()
+
+	for i := 0; i < b.N; i++ {
+		benchmarkSinkStr = tok.Decode(ids)
+	}
+}
--- a/x/tokenizer/tokenizer_bpe.go
+++ b/x/tokenizer/tokenizer_bpe.go
@@ -0,0 +1,175 @@
+//go:build mlx
+
+package tokenizer
+
+import "container/heap"
+
+type bpeMergeNode struct {
+	prev  int
+	next  int
+	token string
+}
+
+type bpePair struct {
+	left  int
+	right int
+	rank  int
+	value string
+}
+
+type bpePairHeap []*bpePair
+
+func (h bpePairHeap) Len() int { return len(h) }
+
+func (h bpePairHeap) Less(i, j int) bool {
+	return h[i].rank < h[j].rank || (h[i].rank == h[j].rank && h[i].left < h[j].left)
+}
+
+func (h bpePairHeap) Swap(i, j int) { h[i], h[j] = h[j], h[i] }
+
+func (h *bpePairHeap) Push(x any) {
+	*h = append(*h, x.(*bpePair))
+}
+
+func (h *bpePairHeap) Pop() any {
+	old := *h
+	n := len(old)
+	item := old[n-1]
+	*h = old[:n-1]
+	return item
+}
+
+// encodeBPEMerge encodes using BPE merge algorithm.
+// Uses the heap/linked-list pair merge strategy from tokenizer/bytepairencoding.go:
+// merge the lowest-rank valid pair, then only recheck adjacent pairs.
+func (t *Tokenizer) encodeBPEMerge(encoded string, ids []int32) []int32 {
+	runes := []rune(encoded)
+	if len(runes) == 0 {
+		return ids
+	}
+
+	nodes := make([]bpeMergeNode, len(runes))
+	for i := range runes {
+		nodes[i] = bpeMergeNode{
+			prev:  i - 1,
+			next:  i + 1,
+			token: string(runes[i]),
+		}
+	}
+
+	pairwise := func(left, right int) *bpePair {
+		if left < 0 || right >= len(nodes) {
+			return nil
+		}
+		if nodes[left].token == "" || nodes[right].token == "" {
+			return nil
+		}
+
+		leftToken, rightToken := nodes[left].token, nodes[right].token
+		rank, ok := t.vocab.Merges[leftToken+" "+rightToken]
+		if !ok {
+			return nil
+		}
+
+		value := leftToken + rightToken
+		if _, ok := t.vocab.Reverse[value]; !ok {
+			return nil
+		}
+
+		return &bpePair{
+			left:  left,
+			right: right,
+			rank:  rank,
+			value: value,
+		}
+	}
+
+	pairs := bpePairHeap{}
+	heap.Init(&pairs)
+	for i := 0; i < len(runes)-1; i++ {
+		if pair := pairwise(i, i+1); pair != nil {
+			heap.Push(&pairs, pair)
+		}
+	}
+
+	for pairs.Len() > 0 {
+		pair := heap.Pop(&pairs).(*bpePair)
+		left, right := nodes[pair.left], nodes[pair.right]
+		if left.token == "" || right.token == "" {
+			continue
+		}
+		if left.next != pair.right || right.prev != pair.left {
+			continue
+		}
+		if left.token+right.token != pair.value {
+			continue
+		}
+
+		nodes[pair.left].token = pair.value
+		nodes[pair.right].token = ""
+		nodes[pair.left].next = right.next
+		if right.next < len(nodes) {
+			nodes[right.next].prev = pair.left
+		}
+
+		if pair := pairwise(nodes[pair.left].prev, pair.left); pair != nil {
+			heap.Push(&pairs, pair)
+		}
+		if pair := pairwise(pair.left, nodes[pair.left].next); pair != nil {
+			heap.Push(&pairs, pair)
+		}
+	}
+
+	for _, node := range nodes {
+		if node.token == "" {
+			continue
+		}
+
+		if id, ok := t.vocab.Reverse[node.token]; ok {
+			ids = append(ids, id)
+			continue
+		}
+
+		ids = t.appendByteFallback(ids, node.token)
+	}
+
+	return ids
+}
+
+func (t *Tokenizer) appendByteFallback(ids []int32, token string) []int32 {
+	if t.typ == TokenizerBPE {
+		for _, r := range token {
+			if b, ok := decodeByteLevelRune(r); ok {
+				if id := t.vocab.byteTokens[b]; id >= 0 {
+					ids = append(ids, id)
+				}
+			}
+		}
+		return ids
+	}
+
+	// SentencePiece fallback uses the UTF-8 bytes for <0xNN> tokens.
+	for _, b := range []byte(token) {
+		if id := t.vocab.byteTokens[b]; id >= 0 {
+			ids = append(ids, id)
+		}
+	}
+	return ids
+}
+
+func decodeByteLevelRune(r rune) (byte, bool) {
+	switch {
+	case r >= 0x00 && r <= 0xFF:
+		return byte(r), true
+	case r == 0x0100:
+		return 0x00, true
+	case r == 0x0143:
+		return 0x00ad, true
+	case r > 0x0100 && r <= 0x0120:
+		return byte(r - 0x0100), true
+	case r > 0x0120 && r <= 0x0142:
+		return byte(r - 0x00a2), true
+	default:
+		return 0, false
+	}
+}
--- a/x/tokenizer/tokenizer_correctness_test.go
+++ b/x/tokenizer/tokenizer_correctness_test.go
@@ -0,0 +1,137 @@
+//go:build mlx
+
+package tokenizer
+
+import (
+	"runtime"
+	"strings"
+	"testing"
+)
+
+func equalIDs(a, b []int32) bool {
+	if len(a) != len(b) {
+		return false
+	}
+	for i := range a {
+		if a[i] != b[i] {
+			return false
+		}
+	}
+	return true
+}
+
+func TestEncodeRoundtripMiniLlama(t *testing.T) {
+	tok := benchmarkLoadMiniLlama(t)
+
+	inputs := []string{
+		"",
+		"hello",
+		"hello world",
+		" hello  world ",
+		"don't we'll they're",
+		"1234567890",
+		"こんにちは世界",
+		"Hello 世界",
+		"func main() {}",
+		"<|begin_of_text|>system\nYou are concise.<|end_of_text|>",
+		strings.Repeat("The quick brown fox jumps over the lazy dog. ", 32),
+	}
+
+	for _, input := range inputs {
+		ids := tok.Encode(input, false)
+		got := tok.Decode(ids)
+		if got != input {
+			t.Fatalf("roundtrip mismatch for %q: got %q", input, got)
+		}
+	}
+}
+
+func TestSplitBySpecialTokensGreedyLongest(t *testing.T) {
+	data := []byte(`{
+		"model": {
+			"type": "BPE",
+			"vocab": {"a": 0, "b": 1},
+			"merges": []
+		},
+		"added_tokens": [
+			{"id": 2, "content": "<tag>", "special": true},
+			{"id": 3, "content": "<tag>x", "special": true}
+		]
+	}`)
+
+	tok, err := LoadFromBytes(data)
+	if err != nil {
+		t.Fatalf("failed to load tokenizer: %v", err)
+	}
+
+	input := "a<tag>xb"
+	want := []string{"a", "<tag>x", "b"}
+
+	got := tok.splitBySpecialTokens(input)
+	if len(got) != len(want) {
+		t.Fatalf("split length mismatch: got %v want %v", got, want)
+	}
+	for i := range want {
+		if got[i] != want[i] {
+			t.Fatalf("split mismatch at %d: got %v want %v", i, got, want)
+		}
+	}
+}
+
+func TestSplitBySpecialTokensFallbackWithoutCache(t *testing.T) {
+	data := []byte(`{
+		"model": {
+			"type": "BPE",
+			"vocab": {"a": 0, "b": 1},
+			"merges": []
+		},
+		"added_tokens": [
+			{"id": 2, "content": "<tag>", "special": true},
+			{"id": 3, "content": "<tag>x", "special": true}
+		]
+	}`)
+
+	tok, err := LoadFromBytes(data)
+	if err != nil {
+		t.Fatalf("failed to load tokenizer: %v", err)
+	}
+
+	input := "a<tag>xb"
+	want := []string{"a", "<tag>x", "b"}
+
+	// Simulate construction outside loader path where cache is not set.
+	tok.sortedSpecialTokens = nil
+
+	got := tok.splitBySpecialTokens(input)
+	if len(got) != len(want) {
+		t.Fatalf("split length mismatch: got %v want %v", got, want)
+	}
+	for i := range want {
+		if got[i] != want[i] {
+			t.Fatalf("split mismatch at %d: got %v want %v", i, got, want)
+		}
+	}
+}
+
+func TestEncodeDeterministicAcrossGOMAXPROCS(t *testing.T) {
+	tok := benchmarkLoadMiniLlama(t)
+
+	input := strings.Repeat("The quick brown fox jumps over the lazy dog. ", 640)
+
+	prev := runtime.GOMAXPROCS(0)
+	defer runtime.GOMAXPROCS(prev)
+
+	runtime.GOMAXPROCS(1)
+	seq := tok.Encode(input, false)
+
+	if prev < 2 {
+		runtime.GOMAXPROCS(2)
+	} else {
+		runtime.GOMAXPROCS(prev)
+	}
+	par := tok.Encode(input, false)
+
+	if !equalIDs(seq, par) {
+		t.Fatalf("encode mismatch between sequential and parallel paths: seq=%d par=%d", len(seq), len(par))
+	}
+}
--- a/x/tokenizer/tokenizer_decode.go
+++ b/x/tokenizer/tokenizer_decode.go
@@ -0,0 +1,56 @@
+//go:build mlx
+
+package tokenizer
+
+import (
+	"strconv"
+	"strings"
+)
+
+// Decode converts token IDs back to text
+func (t *Tokenizer) Decode(ids []int32) string {
+	var sb strings.Builder
+
+	for _, id := range ids {
+		if int(id) >= len(t.vocab.Values) {
+			continue
+		}
+
+		token := t.vocab.Values[id]
+
+		switch t.typ {
+		case TokenizerSentencePiece:
+			// SentencePiece style: replace ▁ with space, decode byte tokens
+			token = strings.ReplaceAll(token, "▁", " ")
+			// Handle byte fallback tokens like <0x0D>
+			if len(token) == 6 && token[0] == '<' && token[1] == '0' && token[2] == 'x' && token[5] == '>' {
+				if v, err := strconv.ParseUint(token[3:5], 16, 8); err == nil {
+					sb.WriteByte(byte(v))
+					continue
+				}
+			}
+			sb.WriteString(token)
+		default:
+			// GPT-2 BPE style: decode byte-level encoding
+			for _, r := range token {
+				switch {
+				case r == 0x0100:
+					// Mirror GGML tokenizer behavior for NULL byte.
+					// 0x00 is omitted during decode.
+					continue
+				case r == 0x0143:
+					r = 0x00ad
+				case r > 0x0100 && r <= 0x0120:
+					r = r - 0x0100
+				case r > 0x0120 && r <= 0x0142:
+					r = r - 0x00a2
+				}
+
+				// Write as byte, not UTF-8 encoded rune
+				sb.WriteByte(byte(r))
+			}
+		}
+	}
+
+	return sb.String()
+}
--- a/x/tokenizer/tokenizer_encode.go
+++ b/x/tokenizer/tokenizer_encode.go
@@ -0,0 +1,289 @@
+//go:build mlx
+
+package tokenizer
+
+import (
+	"runtime"
+	"sort"
+	"strings"
+	"sync"
+	"unicode"
+	"unicode/utf8"
+)
+
+const (
+	encodeParallelMinInputBytes      = 4 * 1024
+	encodeParallelMinChunksPerWorker = 8
+)
+
+type tokenMatch struct {
+	start int
+	end   int
+}
+
+type encodeChunk struct {
+	text      string
+	isSpecial bool
+}
+
+// isNonNewlineWhitespace returns true if s contains only whitespace characters (no newlines)
+func isNonNewlineWhitespace(s string) bool {
+	if s == "" {
+		return false
+	}
+	for _, r := range s {
+		if r == '\n' || r == '\r' {
+			return false
+		}
+		if !unicode.IsSpace(r) {
+			return false
+		}
+	}
+	return true
+}
+
+// splitBySpecialTokens splits text into parts, keeping special tokens as separate elements
+func (t *Tokenizer) splitBySpecialTokens(s string) []string {
+	if len(t.specialTokens) == 0 {
+		return []string{s}
+	}
+
+	tokens := t.sortedSpecialTokens
+	if len(tokens) == 0 {
+		// Fallback for tokenizers constructed outside the loaders.
+		tokens = make([]string, 0, len(t.specialTokens))
+		for tok := range t.specialTokens {
+			tokens = append(tokens, tok)
+		}
+		sort.Slice(tokens, func(i, j int) bool {
+			return len(tokens[i]) > len(tokens[j])
+		})
+	}
+
+	var result []string
+	remaining := s
+
+	for len(remaining) > 0 {
+		found := false
+		for _, tok := range tokens {
+			if strings.HasPrefix(remaining, tok) {
+				result = append(result, tok)
+				remaining = remaining[len(tok):]
+				found = true
+				break
+			}
+		}
+		if !found {
+			// Find next special token position
+			nextPos := len(remaining)
+			for _, tok := range tokens {
+				if idx := strings.Index(remaining, tok); idx != -1 && idx < nextPos {
+					nextPos = idx
+				}
+			}
+			if nextPos > 0 {
+				result = append(result, remaining[:nextPos])
+			}
+			remaining = remaining[nextPos:]
+		}
+	}
+
+	return result
+}
+
+func adjustWhitespaceBoundary(part string, curr, next *tokenMatch) {
+	m := part[curr.start:curr.end]
+	nextText := part[next.start:next.end]
+
+	if !isNonNewlineWhitespace(m) || len(nextText) == 0 {
+		return
+	}
+
+	firstRune, _ := utf8.DecodeRuneInString(nextText)
+	if !unicode.IsLetter(firstRune) {
+		return
+	}
+
+	lastSpaceStart := curr.end
+	for j := curr.end; j > curr.start; {
+		r, size := utf8.DecodeLastRuneInString(part[curr.start:j])
+		if unicode.IsSpace(r) {
+			lastSpaceStart = j - size
+			break
+		}
+		j -= size
+	}
+	if lastSpaceStart > curr.start {
+		curr.end = lastSpaceStart
+		next.start = lastSpaceStart
+	} else {
+		next.start = curr.start
+		curr.end = curr.start
+	}
+}
+
+func (t *Tokenizer) forEachPartChunk(part string, fn func(encodeChunk)) {
+	if _, ok := t.specialTokens[part]; ok {
+		fn(encodeChunk{text: part, isSpecial: true})
+		return
+	}
+
+	if t.pretokenizer == nil {
+		fn(encodeChunk{text: part, isSpecial: false})
+		return
+	}
+
+	re := t.pretokenizer
+	offset := 0
+	loc := re.FindStringIndex(part[offset:])
+	if loc == nil {
+		return
+	}
+
+	curr := tokenMatch{start: offset + loc[0], end: offset + loc[1]}
+	offset += loc[1]
+
+	for {
+		loc = re.FindStringIndex(part[offset:])
+		if loc == nil {
+			if curr.end > curr.start {
+				fn(encodeChunk{text: part[curr.start:curr.end], isSpecial: false})
+			}
+			return
+		}
+
+		next := tokenMatch{start: offset + loc[0], end: offset + loc[1]}
+		offset += loc[1]
+
+		adjustWhitespaceBoundary(part, &curr, &next)
+
+		if curr.end > curr.start {
+			fn(encodeChunk{text: part[curr.start:curr.end], isSpecial: false})
+		}
+		curr = next
+	}
+}
+
+func (t *Tokenizer) appendEncodedChunk(ids []int32, c encodeChunk) []int32 {
+	if c.isSpecial {
+		if id, ok := t.specialTokens[c.text]; ok {
+			return append(ids, id)
+		}
+		return ids
+	}
+
+	return t.encodeChunkInto(c.text, ids)
+}
+
+// Encode tokenizes text to token IDs.
+// Parallel encoding is used only for very large inputs with enough chunks per worker.
+func (t *Tokenizer) Encode(s string, addBOS bool) []int32 {
+	// First: split by special tokens
+	parts := t.splitBySpecialTokens(s)
+
+	// Fast path: encode sequentially without materializing chunk slices.
+	if len(s) < encodeParallelMinInputBytes {
+		var ids []int32
+		for _, part := range parts {
+			t.forEachPartChunk(part, func(c encodeChunk) {
+				ids = t.appendEncodedChunk(ids, c)
+			})
+		}
+
+		if addBOS && t.vocab.BOS >= 0 {
+			ids = append([]int32{t.vocab.BOS}, ids...)
+		}
+		return ids
+	}
+
+	// For large inputs collect chunks to enable parallel processing.
+	var allChunks []encodeChunk
+	for _, part := range parts {
+		t.forEachPartChunk(part, func(c encodeChunk) {
+			allChunks = append(allChunks, c)
+		})
+	}
+
+	// Encode chunks. Use the parallel path only when the chunk count is
+	// large enough to amortize goroutine/synchronization overhead.
+	useParallel := true
+	numWorkers := runtime.GOMAXPROCS(0)
+	if numWorkers > len(allChunks) {
+		numWorkers = len(allChunks)
+	}
+	if numWorkers < 2 || len(allChunks) < numWorkers*encodeParallelMinChunksPerWorker {
+		useParallel = false
+	}
+
+	var ids []int32
+	if !useParallel {
+		for _, c := range allChunks {
+			ids = t.appendEncodedChunk(ids, c)
+		}
+	} else {
+		chunksPer := (len(allChunks) + numWorkers - 1) / numWorkers
+		results := make([][]int32, numWorkers)
+		var wg sync.WaitGroup
+
+		for i := 0; i < numWorkers; i++ {
+			start := i * chunksPer
+			end := start + chunksPer
+			if end > len(allChunks) {
+				end = len(allChunks)
+			}
+			if start >= end {
+				continue
+			}
+
+			wg.Add(1)
+			go func(i int, chunks []encodeChunk) {
+				defer wg.Done()
+				var r []int32
+				for _, c := range chunks {
+					r = t.appendEncodedChunk(r, c)
+				}
+				results[i] = r
+			}(i, allChunks[start:end])
+		}
+		wg.Wait()
+
+		for _, r := range results {
+			ids = append(ids, r...)
+		}
+	}
+
+	if addBOS && t.vocab.BOS >= 0 {
+		ids = append([]int32{t.vocab.BOS}, ids...)
+	}
+	return ids
+}
+
+// encodeChunkInto appends encoded tokens to ids and returns the extended slice.
+// Uses BPE merge algorithm for both BPE and SentencePiece tokenization.
+func (t *Tokenizer) encodeChunkInto(s string, ids []int32) []int32 {
+	if s == "" {
+		return ids
+	}
+
+	// Apply encoding transformation
+	// SentencePiece: replace space with ▁
+	// BPE: convert bytes using precomputed table (GPT-2 byte-level encoding)
+	var encoded string
+	if t.typ == TokenizerSentencePiece {
+		encoded = strings.ReplaceAll(s, " ", "▁")
+	} else {
+		var sb strings.Builder
+		sb.Grow(len(s) * 2)
+		for i := 0; i < len(s); i++ {
+			sb.WriteRune(byteToRune[s[i]])
+		}
+		encoded = sb.String()
+	}
+
+	// Fast path: check if entire chunk is a single token
+	if id, ok := t.vocab.Reverse[encoded]; ok {
+		return append(ids, id)
+	}
+
+	return t.encodeBPEMerge(encoded, ids)
+}
--- a/x/tokenizer/tokenizer_ggml_parity_test.go
+++ b/x/tokenizer/tokenizer_ggml_parity_test.go
@@ -0,0 +1,207 @@
+//go:build mlx
+
+package tokenizer
+
+import (
+	"bufio"
+	"encoding/json"
+	"os"
+	"path/filepath"
+	"runtime"
+	"strings"
+	"testing"
+)
+
+func llama32GGMLFixturePath(tb testing.TB, file string) string {
+	tb.Helper()
+
+	_, filename, _, ok := runtime.Caller(0)
+	if !ok {
+		tb.Fatal("failed to resolve test file path")
+	}
+
+	return filepath.Join(filepath.Dir(filename), "..", "..", "tokenizer", "testdata", "llama3.2", file)
+}
+
+func loadLlama32FromGGMLFixture(tb testing.TB) *Tokenizer {
+	tb.Helper()
+
+	f, err := os.Open(llama32GGMLFixturePath(tb, "encoder.json"))
+	if err != nil {
+		tb.Fatalf("failed to open encoder.json: %v", err)
+	}
+	defer f.Close()
+
+	vocab := make(map[string]int32)
+	if err := json.NewDecoder(f).Decode(&vocab); err != nil {
+		tb.Fatalf("failed to decode encoder.json: %v", err)
+	}
+
+	type addedToken struct {
+		ID      int32  `json:"id"`
+		Content string `json:"content"`
+		Special bool   `json:"special"`
+	}
+	var addedTokens []addedToken
+	for _, token := range []string{"<|begin_of_text|>", "<|end_of_text|>"} {
+		if _, ok := vocab[token]; !ok {
+			id := int32(len(vocab))
+			vocab[token] = id
+			addedTokens = append(addedTokens, addedToken{ID: id, Content: token, Special: true})
+		}
+	}
+
+	mf, err := os.Open(llama32GGMLFixturePath(tb, "vocab.bpe"))
+	if err != nil {
+		tb.Fatalf("failed to open vocab.bpe: %v", err)
+	}
+	defer mf.Close()
+
+	var merges []string
+	scanner := bufio.NewScanner(mf)
+	for scanner.Scan() {
+		line := scanner.Text()
+		if strings.HasPrefix(line, "#") {
+			continue
+		}
+		line = strings.TrimSpace(line)
+		if line != "" {
+			merges = append(merges, line)
+		}
+	}
+	if err := scanner.Err(); err != nil {
+		tb.Fatalf("failed to read vocab.bpe: %v", err)
+	}
+
+	payload := struct {
+		Model struct {
+			Type   string           `json:"type"`
+			Vocab  map[string]int32 `json:"vocab"`
+			Merges []string         `json:"merges"`
+		} `json:"model"`
+		PreTokenizer struct {
+			Type          string `json:"type"`
+			Pretokenizers []struct {
+				Type    string `json:"type"`
+				Pattern struct {
+					Regex string `json:"Regex"`
+				} `json:"pattern"`
+			} `json:"pretokenizers"`
+		} `json:"pre_tokenizer"`
+		AddedTokens []addedToken `json:"added_tokens"`
+	}{}
+
+	payload.Model.Type = "BPE"
+	payload.Model.Vocab = vocab
+	payload.Model.Merges = merges
+	payload.PreTokenizer.Type = "Sequence"
+	payload.PreTokenizer.Pretokenizers = []struct {
+		Type    string `json:"type"`
+		Pattern struct {
+			Regex string `json:"Regex"`
+		} `json:"pattern"`
+	}{
+		{
+			Type: "Split",
+			Pattern: struct {
+				Regex string `json:"Regex"`
+			}{
+				Regex: `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
+			},
+		},
+	}
+	payload.AddedTokens = addedTokens
+
+	data, err := json.Marshal(payload)
+	if err != nil {
+		tb.Fatalf("failed to marshal synthetic tokenizer.json: %v", err)
+	}
+
+	tok, err := LoadFromBytes(data)
+	if err != nil {
+		tb.Fatalf("failed to load tokenizer from fixture data: %v", err)
+	}
+	return tok
+}
+
+func TestGGMLLlamaKnownEncodings(t *testing.T) {
+	tok := loadLlama32FromGGMLFixture(t)
+
+	cases := map[string][]int32{
+		"hello world":                                          {15339, 1917},
+		"hello <|end_of_text|>":                                {15339, 220, 128001},
+		"<|begin_of_text|>A B!":                                {128000, 32, 426, 0},
+		"<|begin_of_text|>A<|end_of_text|>B!":                  {128000, 32, 128001, 33, 0},
+		"<|begin_of_text|>A<|end_of_text|>B<|begin_of_text|>!": {128000, 32, 128001, 33, 128000, 0},
+		"<|begin_of_text|>A<|end_of_text|>B<|begin_of_text|>!<|end_of_text|>": {128000, 32, 128001, 33, 128000, 0, 128001},
+	}
+
+	for input, want := range cases {
+		got := tok.Encode(input, false)
+		if !equalIDs(got, want) {
+			t.Fatalf("encode mismatch for %q:\n got:  %v\n want: %v", input, got, want)
+		}
+	}
+}
+
+func TestGGMLLlamaRepeatedZeros(t *testing.T) {
+	tok := loadLlama32FromGGMLFixture(t)
+
+	cases := map[int][]int32{
+		1:  {15},
+		2:  {410},
+		3:  {931},
+		4:  {931, 15},
+		5:  {931, 410},
+		6:  {931, 931},
+		7:  {931, 931, 15},
+		8:  {931, 931, 410},
+		9:  {931, 931, 931},
+		10: {931, 931, 931, 15},
+		11: {931, 931, 931, 410},
+		12: {931, 931, 931, 931},
+		13: {931, 931, 931, 931, 15},
+		14: {931, 931, 931, 931, 410},
+		15: {931, 931, 931, 931, 931},
+		16: {931, 931, 931, 931, 931, 15},
+		17: {931, 931, 931, 931, 931, 410},
+	}
+
+	for n, want := range cases {
+		input := strings.Repeat("0", n)
+		got := tok.Encode(input, false)
+		if !equalIDs(got, want) {
+			t.Fatalf("encode mismatch for %q:\n got:  %v\n want: %v", input, got, want)
+		}
+	}
+}
+
+func TestGGMLLlamaRoundtripAndByteBehavior(t *testing.T) {
+	tok := loadLlama32FromGGMLFixture(t)
+
+	cases := []string{
+		"hello",
+		"hello ",
+		"hello  ",
+		" hello",
+		" hello ",
+		" hello  ",
+		"hello world",
+		"请考试我的软件！12345",
+	}
+
+	for _, input := range cases {
+		ids := tok.Encode(input, false)
+		got := tok.Decode(ids)
+		if got != input {
+			t.Fatalf("roundtrip mismatch for %q: got %q", input, got)
+		}
+	}
+
+	// Match GGML tokenizer behavior: 0x00 is omitted when decoding.
+	ids := tok.Encode(string(rune(0x00)), false)
+	got := tok.Decode(ids)
+	if got != "" {
+		t.Fatalf("expected empty decode for 0x00, got %q (ids=%v)", got, ids)
+	}
+}
--- a/x/tokenizer/tokenizer_load.go
+++ b/x/tokenizer/tokenizer_load.go
@@ -0,0 +1,458 @@
+//go:build mlx
+
+package tokenizer
+
+import (
+	"encoding/json"
+	"fmt"
+	"regexp"
+	"sort"
+	"strings"
+)
+
+// TokenizerConfig holds optional configuration data that can be passed to LoadFromBytesWithConfig.
+type TokenizerConfig struct {
+	TokenizerConfigJSON  []byte // tokenizer_config.json content
+	GenerationConfigJSON []byte // generation_config.json content
+	SpecialTokensMapJSON []byte // special_tokens_map.json content
+	ConfigJSON           []byte // config.json content
+}
+
+// LoadFromBytes loads a tokenizer from tokenizer.json bytes.
+// This is useful when loading from blob storage where the file content is already in memory.
+// Note: This won't load special token config from companion files. Use LoadFromBytesWithConfig
+// to provide tokenizer_config.json data for proper PAD/EOS token loading.
+func LoadFromBytes(data []byte) (*Tokenizer, error) {
+	return loadFromTokenizerJSON(data)
+}
+
+// LoadFromBytesWithConfig loads a tokenizer from tokenizer.json bytes with additional config files.
+// This is useful when loading from blob storage where companion config files are also blobs.
+func LoadFromBytesWithConfig(data []byte, config *TokenizerConfig) (*Tokenizer, error) {
+	t, err := loadFromTokenizerJSON(data)
+	if err != nil {
+		return nil, err
+	}
+
+	if config == nil {
+		return t, nil
+	}
+
+	// Apply special token configs from provided data
+	loadSpecialTokenConfigFromBytes(t, config)
+
+	return t, nil
+}
+
+// loadFromTokenizerJSON parses tokenizer.json content from bytes.
+func loadFromTokenizerJSON(data []byte) (*Tokenizer, error) {
+
+	var raw struct {
+		Model struct {
+			Type   string           `json:"type"` // "BPE"
+			Vocab  map[string]int32 `json:"vocab"`
+			Merges json.RawMessage  `json:"merges"` // Can be []string or [][]string (BPE only)
+		} `json:"model"`
+		PreTokenizer json.RawMessage `json:"pre_tokenizer"`
+		Decoder      json.RawMessage `json:"decoder"`
+		AddedTokens  []struct {
+			ID      int32  `json:"id"`
+			Content string `json:"content"`
+			Special bool   `json:"special"`
+		} `json:"added_tokens"`
+	}
+
+	if err := json.Unmarshal(data, &raw); err != nil {
+		return nil, fmt.Errorf("failed to parse tokenizer: %w", err)
+	}
+
+	// Covers SentencePiece and BPE models
+	if raw.Model.Type != "BPE" {
+		return nil, fmt.Errorf("unsupported tokenizer type: %s", raw.Model.Type)
+	}
+
+	// Parse merges - can be []string (Llama) or [][]string (GPT-OSS).
+	var mergesStrings []string
+	if raw.Model.Merges != nil {
+		var mergesArrays [][]string
+		if err := json.Unmarshal(raw.Model.Merges, &mergesStrings); err != nil {
+			// Try array of arrays format
+			if err := json.Unmarshal(raw.Model.Merges, &mergesArrays); err != nil {
+				return nil, fmt.Errorf("failed to parse merges: %w", err)
+			}
+			// Convert [][]string to []string
+			mergesStrings = make([]string, len(mergesArrays))
+			for i, pair := range mergesArrays {
+				if len(pair) != 2 {
+					return nil, fmt.Errorf("failed to parse merges: expected merge pair of length 2, got %d", len(pair))
+				}
+				mergesStrings[i] = pair[0] + " " + pair[1]
+			}
+		}
+	}
+
+	// Build tokenizer
+	t := &Tokenizer{
+		vocab: &Vocabulary{
+			Values:  make([]string, len(raw.Model.Vocab)),
+			Reverse: raw.Model.Vocab,
+			Merges:  make(map[string]int, len(mergesStrings)),
+			BOS:     -1,
+			PAD:     -1,
+		},
+		specialTokens: make(map[string]int32),
+	}
+
+	// Build values array
+	for token, id := range raw.Model.Vocab {
+		if int(id) >= len(t.vocab.Values) {
+			newValues := make([]string, id+1)
+			copy(newValues, t.vocab.Values)
+			t.vocab.Values = newValues
+		}
+		t.vocab.Values[id] = token
+	}
+
+	// Build merges map
+	for i, merge := range mergesStrings {
+		t.vocab.Merges[merge] = i
+	}
+
+	// Add all added_tokens to vocabulary and special tokens map.
+	// HuggingFace treats ALL added_tokens as special for tokenization purposes -
+	// they bypass BPE and get their own token ID. The "special" flag just indicates
+	// if it's a "truly special" token like BOS/EOS/PAD, but for tokenization we need
+	// to treat all added_tokens as special to match HuggingFace behavior.
+	for _, tok := range raw.AddedTokens {
+		if int(tok.ID) >= len(t.vocab.Values) {
+			newValues := make([]string, tok.ID+1)
+			copy(newValues, t.vocab.Values)
+			t.vocab.Values = newValues
+		}
+		t.vocab.Values[tok.ID] = tok.Content
+		t.specialTokens[tok.Content] = tok.ID // Add ALL added_tokens to special tokens
+	}
+
+	// Precompute byte token IDs for <0xNN> fallback
+	initByteTokens(t)
+
+	// Determine tokenizer type
+	switch {
+	case detectSentencePiece(raw.Decoder):
+		t.typ = TokenizerSentencePiece
+	default:
+		t.typ = TokenizerBPE
+	}
+
+	// Parse and compile pretokenizer pattern (BPE only - SentencePiece doesn't use pretokenizer)
+	if t.typ == TokenizerBPE {
+		pattern := extractPretokenizer(raw.PreTokenizer)
+		if pattern == "" {
+			pattern = `'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+`
+		}
+		re, err := regexp.Compile(rewritePatternForRE2(pattern))
+		if err != nil {
+			return nil, fmt.Errorf("failed to compile pretokenizer regex %q: %w", pattern, err)
+		}
+		t.pretokenizer = re
+	}
+
+	cacheSortedSpecialTokens(t)
+
+	return t, nil
+}
+
+func cacheSortedSpecialTokens(t *Tokenizer) {
+	if len(t.specialTokens) == 0 {
+		t.sortedSpecialTokens = nil
+		return
+	}
+
+	tokens := make([]string, 0, len(t.specialTokens))
+	for tok := range t.specialTokens {
+		tokens = append(tokens, tok)
+	}
+	sort.Slice(tokens, func(i, j int) bool {
+		return len(tokens[i]) > len(tokens[j])
+	})
+	t.sortedSpecialTokens = tokens
+}
+
+type specialTokenConfigData struct {
+	tokenizerConfigJSON  []byte
+	generationConfigJSON []byte
+	specialTokensMapJSON []byte
+	configJSON           []byte
+}
+
+func applySpecialTokenConfig(t *Tokenizer, config specialTokenConfigData) {
+	parseTokenIDs := func(v interface{}) []int32 {
+		switch val := v.(type) {
+		case float64:
+			return []int32{int32(val)}
+		case []interface{}:
+			ids := make([]int32, 0, len(val))
+			for _, id := range val {
+				if f, ok := id.(float64); ok {
+					ids = append(ids, int32(f))
+				}
+			}
+			return ids
+		}
+		return nil
+	}
+
+	// Priority 1: generation_config.json
+	if len(config.generationConfigJSON) > 0 {
+		var genConfig struct {
+			EOSTokenID interface{} `json:"eos_token_id"`
+			BOSTokenID interface{} `json:"bos_token_id"`
+		}
+		if err := json.Unmarshal(config.generationConfigJSON, &genConfig); err == nil {
+			if ids := parseTokenIDs(genConfig.EOSTokenID); len(ids) > 0 {
+				t.vocab.EOS = ids
+			}
+			if ids := parseTokenIDs(genConfig.BOSTokenID); len(ids) > 0 {
+				t.vocab.BOS = ids[0]
+			}
+		}
+	}
+
+	// Priority 2: config.json
+	if len(config.configJSON) > 0 && (len(t.vocab.EOS) == 0 || t.vocab.BOS < 0) {
+		var modelConfig struct {
+			EOSTokenID interface{} `json:"eos_token_id"`
+			BOSTokenID interface{} `json:"bos_token_id"`
+		}
+		if err := json.Unmarshal(config.configJSON, &modelConfig); err == nil {
+			if len(t.vocab.EOS) == 0 {
+				if ids := parseTokenIDs(modelConfig.EOSTokenID); len(ids) > 0 {
+					t.vocab.EOS = ids
+				}
+			}
+			if t.vocab.BOS < 0 {
+				if ids := parseTokenIDs(modelConfig.BOSTokenID); len(ids) > 0 {
+					t.vocab.BOS = ids[0]
+				}
+			}
+		}
+	}
+
+	// Priority 3: tokenizer_config.json
+	if len(config.tokenizerConfigJSON) > 0 {
+		var tokConfig struct {
+			BOSToken    interface{} `json:"bos_token"`
+			EOSToken    interface{} `json:"eos_token"`
+			PADToken    interface{} `json:"pad_token"`
+			AddBOSToken *bool       `json:"add_bos_token"`
+			AddEOSToken *bool       `json:"add_eos_token"`
+		}
+		if err := json.Unmarshal(config.tokenizerConfigJSON, &tokConfig); err == nil {
+			if t.vocab.BOS < 0 {
+				if bosStr := extractTokenString(tokConfig.BOSToken); bosStr != "" {
+					if id, ok := t.specialTokens[bosStr]; ok {
+						t.vocab.BOS = id
+					}
+				}
+			}
+			if len(t.vocab.EOS) == 0 {
+				if eosStr := extractTokenString(tokConfig.EOSToken); eosStr != "" {
+					if id, ok := t.specialTokens[eosStr]; ok {
+						t.vocab.EOS = []int32{id}
+					}
+				}
+			}
+			if t.vocab.PAD < 0 {
+				if padStr := extractTokenString(tokConfig.PADToken); padStr != "" {
+					if id, ok := t.specialTokens[padStr]; ok {
+						t.vocab.PAD = id
+					}
+				}
+			}
+			if tokConfig.AddBOSToken != nil {
+				t.vocab.AddBOS = *tokConfig.AddBOSToken
+			}
+			if tokConfig.AddEOSToken != nil {
+				t.vocab.AddEOS = *tokConfig.AddEOSToken
+			}
+		}
+	}
+
+	// Priority 4: special_tokens_map.json
+	if len(config.specialTokensMapJSON) > 0 {
+		var tokensMap map[string]interface{}
+		if err := json.Unmarshal(config.specialTokensMapJSON, &tokensMap); err == nil {
+			if t.vocab.BOS < 0 {
+				if bosStr := extractTokenString(tokensMap["bos_token"]); bosStr != "" {
+					if id, ok := t.specialTokens[bosStr]; ok {
+						t.vocab.BOS = id
+					}
+				}
+			}
+			if len(t.vocab.EOS) == 0 {
+				if eosStr := extractTokenString(tokensMap["eos_token"]); eosStr != "" {
+					if id, ok := t.specialTokens[eosStr]; ok {
+						t.vocab.EOS = []int32{id}
+					}
+				}
+			}
+			if t.vocab.PAD < 0 {
+				if padStr := extractTokenString(tokensMap["pad_token"]); padStr != "" {
+					if id, ok := t.specialTokens[padStr]; ok {
+						t.vocab.PAD = id
+					}
+				}
+			}
+		}
+	}
+}
+
+// extractTokenString extracts the token string from various formats used in HuggingFace configs.
+// Tokens can be represented as:
+//   - string: "token"
+//   - object: {"content": "token", ...}
+func extractTokenString(v interface{}) string {
+	if v == nil {
+		return ""
+	}
+	// Direct string
+	if s, ok := v.(string); ok {
+		return s
+	}
+	// Object with content field
+	if m, ok := v.(map[string]interface{}); ok {
+		if content, ok := m["content"].(string); ok {
+			return content
+		}
+	}
+	return ""
+}
+
+// rewritePatternForRE2 rewrites HuggingFace pretokenizer regex patterns to be
+// compatible with Go's regexp package (RE2). HuggingFace patterns use PCRE features:
+//   - (?!\S) negative lookahead - RE2 doesn't support this
+//   - (?i:...) inline case-insensitive groups - RE2 doesn't support this
+//
+// We replace \s+(?!\S)|\s+ with \s+ and fix whitespace boundaries in encodeWithRegex().
+// The lookahead version splits "a  b" into ["a", " ", " b"] (space prepended to word).
+// Simple \s+ would give ["a", "  ", "b"]. We post-process to match Python's behavior.
+func rewritePatternForRE2(pattern string) string {
+	// Replace lookahead pattern with simple \s+ - we fix boundaries in encodeWithRegex()
+	pattern = strings.ReplaceAll(pattern, `\s+(?!\S)|\s+`, `\s+`)
+
+	// Handle the pattern when it appears with a ? suffix (optional contractions in GPT-4o style)
+	// IMPORTANT: Must be done before the non-optional version to avoid partial replacement
+	pattern = strings.ReplaceAll(pattern,
+		`(?i:'s|'t|'re|'ve|'m|'ll|'d)?`,
+		`(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?`)
+
+	// Expand case-insensitive contraction pattern to explicit alternations
+	// (?i:'s|'t|'re|'ve|'m|'ll|'d) -> '[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD]
+	pattern = strings.ReplaceAll(pattern,
+		`(?i:'s|'t|'re|'ve|'m|'ll|'d)`,
+		`(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])`)
+
+	return pattern
+}
+
+// loadSpecialTokenConfigFromBytes loads special token configuration from byte slices.
+func loadSpecialTokenConfigFromBytes(t *Tokenizer, config *TokenizerConfig) {
+	applySpecialTokenConfig(t, specialTokenConfigData{
+		tokenizerConfigJSON:  config.TokenizerConfigJSON,
+		generationConfigJSON: config.GenerationConfigJSON,
+		specialTokensMapJSON: config.SpecialTokensMapJSON,
+		configJSON:           config.ConfigJSON,
+	})
+}
+
+// detectSentencePiece checks if the decoder uses SentencePiece-style (▁ for spaces)
+// vs GPT-2 byte-level encoding
+func detectSentencePiece(data json.RawMessage) bool {
+	if data == nil {
+		return false
+	}
+
+	// Check for Sequence decoder with Replace step (SentencePiece style)
+	var seq struct {
+		Type     string `json:"type"`
+		Decoders []struct {
+			Type    string `json:"type"`
+			Pattern struct {
+				String string `json:"String"`
+			} `json:"pattern"`
+		} `json:"decoders"`
+	}
+	if err := json.Unmarshal(data, &seq); err == nil {
+		if seq.Type == "Sequence" {
+			for _, dec := range seq.Decoders {
+				// Look for Replace decoder that converts ▁ to space
+				if dec.Type == "Replace" && dec.Pattern.String == "▁" {
+					return true
+				}
+			}
+		}
+	}
+
+	// Check for direct ByteLevel decoder (GPT-2 style)
+	var simple struct {
+		Type string `json:"type"`
+	}
+	if err := json.Unmarshal(data, &simple); err == nil {
+		if simple.Type == "ByteLevel" {
+			return false
+		}
+	}
+
+	return false
+}
+
+// initByteTokens precomputes byte token IDs for <0xNN> fallback encoding
+func initByteTokens(t *Tokenizer) {
+	for i := range t.vocab.byteTokens {
+		t.vocab.byteTokens[i] = -1
+	}
+	for b := 0; b < 256; b++ {
+		token := fmt.Sprintf("<0x%02X>", b)
+		if id, ok := t.vocab.Reverse[token]; ok {
+			t.vocab.byteTokens[b] = id
+		}
+	}
+}
+
+// extractPretokenizer extracts the regex pattern from the pre_tokenizer config
+func extractPretokenizer(data json.RawMessage) string {
+	if data == nil {
+		return ""
+	}
+
+	// Try to parse as a single Split pretokenizer
+	var single struct {
+		Type    string `json:"type"`
+		Pattern struct {
+			Regex string `json:"Regex"`
+		} `json:"pattern"`
+	}
+	if err := json.Unmarshal(data, &single); err == nil && single.Pattern.Regex != "" {
+		return single.Pattern.Regex
+	}
+
+	// Try to parse as Sequence of pretokenizers - use first Split pattern
+	var seq struct {
+		Type          string `json:"type"`
+		Pretokenizers []struct {
+			Type    string `json:"type"`
+			Pattern struct {
+				Regex string `json:"Regex"`
+			} `json:"pattern"`
+		} `json:"pretokenizers"`
+	}
+	if err := json.Unmarshal(data, &seq); err == nil && seq.Type == "Sequence" {
+		for _, pt := range seq.Pretokenizers {
+			if pt.Type == "Split" && pt.Pattern.Regex != "" {
+				return pt.Pattern.Regex
+			}
+		}
+	}
+
+	return ""
+}
--- a/x/tokenizer/tokenizer_load_test.go
+++ b/x/tokenizer/tokenizer_load_test.go
@@ -0,0 +1,26 @@
+//go:build mlx
+
+package tokenizer
+
+import (
+	"strings"
+	"testing"
+)
+
+func TestLoadFromBytesRejectsWordPiece(t *testing.T) {
+	data := []byte(`{
+		"model": {
+			"type": "WordPiece",
+			"vocab": {"[UNK]": 0, "hello": 1}
+		},
+		"added_tokens": []
+	}`)
+
+	_, err := LoadFromBytes(data)
+	if err == nil {
+		t.Fatal("expected WordPiece load to fail")
+	}
+	if !strings.Contains(err.Error(), "unsupported tokenizer type: WordPiece") {
+		t.Fatalf("unexpected error: %v", err)
+	}
+}